### Open .csv file

In [1]:
import pandas as pd

In [2]:
# The source file is semicolon-delimited and is read as UTF-8.
df = pd.read_csv(
    './dataset/dataTest.csv',
    sep=';',
    encoding='utf-8',
)
df

Unnamed: 0,comment,label
0,@sn5301679 pertanyaanya negara ini sudah maju...,NEGATIF
1,@deppaykurniawan2016 Makanya itulah pentingn...,POSITIF
2,@sn5301679 isu sensitif dan sering jadi komo...,NEGATIF
3,@vivifitriasari pernah pengalaman kerja di p...,NEGATIF
4,@deppaykurniawan2016 Wah kebetulan bgt ya ma...,POSITIF
5,@deppaykurniawan2016 Wah kebetulan bgt ya ma...,POSITIF
6,@deppaykurniawan2016 Wah kebetulan bgt ya ma...,POSITIF
7,@deppaykurniawan2016 Wah kebetulan bgt ya ma...,POSITIF
8,@deppaykurniawan2016 Wah kebetulan bgt ya ma...,POSITIF
9,@deppaykurniawan2016 Wah kebetulan bgt ya ma...,POSITIF


In [3]:
# Language of the comments being processed: 'ind' or 'en'.
# The same code is handed to cleanComment, stopwordRemoval and stemming below.
LANGUAGE = 'ind'

### Cleaning

In [4]:
import re
import os

In [5]:
# Slang dictionaries that were already parsed, keyed by absolute file path, so
# the file is read once instead of once per comment. Re-run this cell to pick
# up edits made to a slang file.
_SLANG_CACHE = {}


def _load_slang_dict(language):
    """Return the slang -> formal mapping stored in ``slang-{language}.txt``.

    Each useful line has the form ``slang,formal``. Blank lines and lines
    without both parts are skipped instead of aborting the whole load. A
    missing, empty or unreadable file yields an empty mapping.
    """
    filename = os.path.abspath(f'slang-{language}.txt')
    if filename in _SLANG_CACHE:
        return _SLANG_CACHE[filename]
    if not (os.path.exists(filename) and os.path.getsize(filename) > 0):
        # Not cached, so a file created later is still picked up.
        return {}

    slang_dict = {}
    try:
        # utf-8-sig reads plain UTF-8 and also drops a leading BOM if present.
        with open(filename, 'r', encoding='utf-8-sig') as f:
            for line in f:
                slang, sep, formal = line.partition(',')
                slang, formal = slang.strip(), formal.strip()
                if sep and slang and formal:
                    slang_dict[slang] = formal
    except (OSError, UnicodeError) as e:
        print("Err: Failed to load slang dictionary due to", str(e))
        return {}

    _SLANG_CACHE[filename] = slang_dict
    return slang_dict


def cleanComment(comment, language):
    """Remove mentions, punctuation and digits, then replace known slang words.

    Args:
        comment: Raw comment text.
        language: Language code used to locate ``slang-{language}.txt`` in the
            current working directory.

    Returns:
        The cleaned comment with words separated by single spaces. An empty
        string is returned (and an error is printed) when ``comment`` is not a
        string, for example a missing value.

    Note:
        The slang lookup is an exact, case-sensitive match on each word.
    """
    if not isinstance(comment, str):
        print("Err: Failed to clean comments")
        return ''

    # Drop @mentions and every character that is neither a word character nor
    # whitespace, then drop runs of digits.
    comment = re.sub(r'@\w+|[^\w\s]', '', comment)
    comment = re.sub(r'\d+', '', comment)

    slang_dict = _load_slang_dict(language)
    words = [slang_dict.get(word, word) for word in comment.split()]
    return ' '.join(words)


In [6]:
# Clean every raw comment; LANGUAGE is forwarded as cleanComment's second argument.
df['cleanComment'] = df['comment'].apply(cleanComment, args=(LANGUAGE,))
df.head()

Unnamed: 0,comment,label,cleanComment
0,@sn5301679 pertanyaanya negara ini sudah maju...,NEGATIF,pertanyaanya negara ini sudah maju atau blm In...
1,@deppaykurniawan2016 Makanya itulah pentingn...,POSITIF,Makanya itulah pentingnya adaptasi Mau gimana ...
2,@sn5301679 isu sensitif dan sering jadi komo...,NEGATIF,isu sensitif dan sering jadi komoditas politik...
3,@vivifitriasari pernah pengalaman kerja di p...,NEGATIF,pernah pengalaman kerja di perusahaan Teknolog...
4,@deppaykurniawan2016 Wah kebetulan bgt ya ma...,POSITIF,Wah kebetulan bgt ya mas saya pernah kerja di ...


### Case folding

In [7]:
def caseFolding(cleanComment):
    """Return ``cleanComment`` converted to lower case.

    Args:
        cleanComment: Text to lower-case.

    Returns:
        The lower-cased text, or an empty string (after printing an error)
        when lower-casing fails, e.g. because the value is not a string.
    """
    try:
        return cleanComment.lower()
    except Exception:
        # Only ordinary errors are treated as "bad input"; KeyboardInterrupt
        # and SystemExit are allowed to propagate.
        print("Err: Failed to case folding")
        return ''

In [8]:
# Lower-case each cleaned comment.
df['resultCaseFolding'] = df['cleanComment'].map(caseFolding)
df[['cleanComment', 'resultCaseFolding']].head()

Unnamed: 0,cleanComment,resultCaseFolding
0,pertanyaanya negara ini sudah maju atau blm In...,pertanyaanya negara ini sudah maju atau blm in...
1,Makanya itulah pentingnya adaptasi Mau gimana ...,makanya itulah pentingnya adaptasi mau gimana ...
2,isu sensitif dan sering jadi komoditas politik...,isu sensitif dan sering jadi komoditas politik...
3,pernah pengalaman kerja di perusahaan Teknolog...,pernah pengalaman kerja di perusahaan teknolog...
4,Wah kebetulan bgt ya mas saya pernah kerja di ...,wah kebetulan bgt ya mas saya pernah kerja di ...


### Tokenizing

In [10]:
def tokenize(comments):
    """Split a comment into a list of word tokens.

    Args:
        comments: Text to split.

    Returns:
        The list of tokens, or an empty list (after printing the reason) when
        the value cannot be split, e.g. because it is not a string.
    """
    try:
        # split() with no separator breaks on any run of whitespace (spaces,
        # tabs, newlines) and never produces empty strings.
        words = comments.split()
    except Exception as e:
        print("Err: Failed to tokenize due to", str(e))
        words = []
    return words

In [11]:
# Turn each lower-cased comment into a list of tokens.
df['resultTokenize'] = df['resultCaseFolding'].map(tokenize)
df[['resultCaseFolding', 'resultTokenize']].head()

Unnamed: 0,resultCaseFolding,resultTokenize
0,pertanyaanya negara ini sudah maju atau blm in...,"[pertanyaanya, negara, ini, sudah, maju, atau,..."
1,makanya itulah pentingnya adaptasi mau gimana ...,"[makanya, itulah, pentingnya, adaptasi, mau, g..."
2,isu sensitif dan sering jadi komoditas politik...,"[isu, sensitif, dan, sering, jadi, komoditas, ..."
3,pernah pengalaman kerja di perusahaan teknolog...,"[pernah, pengalaman, kerja, di, perusahaan, te..."
4,wah kebetulan bgt ya mas saya pernah kerja di ...,"[wah, kebetulan, bgt, ya, mas, saya, pernah, k..."


In [12]:
# !pip install nltk

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Dell
[nltk_data]     7300\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Stopword sets that were already built, keyed by language code, so the NLTK
# list and the optional extra file are loaded once instead of once per row.
# Re-run this cell to pick up edits made to a stopwords file.
_STOPWORD_CACHE = {}


def _load_stopwords(language):
    """Build the stopword set for ``language``.

    The set is NLTK's list for the mapped language plus, when
    ``stopwords-{language}.txt`` exists and is non-empty, one extra stopword
    per line of that file (surrounding whitespace and blank lines ignored).
    """
    language_mapping = {
        'en': 'english',
        'ind': 'indonesian'
    }
    # Unknown codes fall back to the English list.
    nltk_language = language_mapping.get(language, 'english')

    stop_set = set(stopwords.words(nltk_language))
    filename = f'stopwords-{language}.txt'
    if os.path.exists(filename) and os.path.getsize(filename) > 0:
        # utf-8-sig reads plain UTF-8 and also drops a leading BOM if present.
        with open(filename, 'r', encoding='utf-8-sig') as f:
            stop_set.update(line.strip() for line in f if line.strip())
    return stop_set


def stopwordRemoval(comments, language):
    """Return the tokens of ``comments`` that are not stopwords.

    Args:
        comments: Iterable of word tokens.
        language: Language code ('en' or 'ind'; anything else uses English).

    Returns:
        A list with the stopwords removed, in the original order. If the
        stopwords cannot be loaded or the input cannot be processed, the
        reason is printed and an empty list is returned.
    """
    try:
        stop_set = _STOPWORD_CACHE.get(language)
        if stop_set is None:
            # Cached only after a successful load, so a failed load is retried.
            stop_set = _load_stopwords(language)
            _STOPWORD_CACHE[language] = stop_set
        return [word for word in comments if word not in stop_set]
    except Exception as e:
        print("Err: Failed to remove stopwords due to", str(e))
        return []

In [14]:
# Filter stopwords out of each token list; LANGUAGE is forwarded as the second argument.
df['resultStopword'] = df['resultTokenize'].apply(stopwordRemoval, args=(LANGUAGE,))
df[['resultTokenize', 'resultStopword']].head()

Unnamed: 0,resultTokenize,resultStopword
0,"[pertanyaanya, negara, ini, sudah, maju, atau,...","[pertanyaanya, negara, maju, blm, indonesia, t..."
1,"[makanya, itulah, pentingnya, adaptasi, mau, g...","[adaptasi, gimana, keadaannya, adaptasi, berta..."
2,"[isu, sensitif, dan, sering, jadi, komoditas, ...","[isu, sensitif, komoditas, politik, win, solut..."
3,"[pernah, pengalaman, kerja, di, perusahaan, te...","[pengalaman, kerja, perusahaan, teknologi, ind..."
4,"[wah, kebetulan, bgt, ya, mas, saya, pernah, k...","[bgt, ya, mas, kerja, batam, tamatan, smp, sma..."


### Remove empty rows ([])

In [15]:
# Keep only the rows that still have at least one token after stopword removal.
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not raise pandas' SettingWithCopyWarning.
df = df[df['resultStopword'].apply(lambda x: len(x) > 0)].copy()
df['resultStopword']

0     [pertanyaanya, negara, maju, blm, indonesia, t...
1     [adaptasi, gimana, keadaannya, adaptasi, berta...
2     [isu, sensitif, komoditas, politik, win, solut...
3     [pengalaman, kerja, perusahaan, teknologi, ind...
4     [bgt, ya, mas, kerja, batam, tamatan, smp, sma...
5     [bgt, ya, mas, kerja, batam, tamatan, smp, sma...
6     [bgt, ya, mas, kerja, batam, tamatan, smp, sma...
7     [bgt, ya, mas, kerja, batam, tamatan, smp, sma...
8     [bgt, ya, mas, kerja, batam, tamatan, smp, sma...
9     [bgt, ya, mas, kerja, batam, tamatan, smp, sma...
10    [bgt, ya, mas, kerja, batam, tamatan, smp, sma...
11    [o, gawe, rubycon, kawasan, industri, muka, ku...
12    [sorry, bro, merendahkan, orang, tua, keras, k...
13    [sorry, bro, merendahkan, orang, tua, keras, k...
14    [sorry, bro, merendahkan, orang, tua, keras, k...
15    [sorry, bro, merendahkan, orang, tua, keras, k...
16    [sorry, bro, merendahkan, orang, tua, keras, k...
Name: resultStopword, dtype: object

### Stemming

In [16]:
# !pip install Sastrawi

from nltk.stem import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [17]:
# Stemmer instances that were already built: one for Indonesian (Sastrawi) and
# one shared by every other language code (Porter). Reusing them avoids
# constructing a new factory and stemmer for every single term.
_STEMMER_CACHE = {}


def _get_stemmer(language):
    """Return the cached stemmer for ``language``, building it on first use."""
    key = 'ind' if language == 'ind' else 'porter'
    if key not in _STEMMER_CACHE:
        if key == 'ind':
            _STEMMER_CACHE[key] = StemmerFactory().create_stemmer()
        else:
            _STEMMER_CACHE[key] = PorterStemmer()
    return _STEMMER_CACHE[key]


def stemmed_wrapper(term, language):
    """Stem a single term.

    Args:
        term: The word to stem.
        language: 'ind' uses the Sastrawi stemmer; any other value uses
            NLTK's PorterStemmer.

    Returns:
        The stemmed term as returned by the selected stemmer.
    """
    return _get_stemmer(language).stem(term)


def stemming(document, language):
    """Stem every term of ``document`` and return the results as a list.

    Args:
        document: Iterable of terms.
        language: Language code forwarded to ``stemmed_wrapper``.
    """
    return [stemmed_wrapper(term, language) for term in document]


In [18]:
# Stem every remaining token; LANGUAGE is forwarded as stemming's second argument.
df['resultStemming'] = df['resultStopword'].apply(stemming, args=(LANGUAGE,))
df[['resultStopword', 'resultStemming']].head()

Unnamed: 0,resultStopword,resultStemming
0,"[pertanyaanya, negara, maju, blm, indonesia, t...","[pertanyaanya, negara, maju, blm, indonesia, t..."
1,"[adaptasi, gimana, keadaannya, adaptasi, berta...","[adaptasi, gimana, ada, adaptasi, tahan, bandi..."
2,"[isu, sensitif, komoditas, politik, win, solut...","[isu, sensitif, komoditas, politik, win, solut..."
3,"[pengalaman, kerja, perusahaan, teknologi, ind...","[alam, kerja, usaha, teknologi, indonesia, lan..."
4,"[bgt, ya, mas, kerja, batam, tamatan, smp, sma...","[bgt, ya, mas, kerja, batam, tamat, smp, sma, ..."


### Export clean data

In [19]:
# to_csv does not create missing folders, so make sure the target exists first.
OUTPUT_DIR = './output'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The file name follows LANGUAGE, so a run for another language does not
# overwrite this one ('ind' gives ./output/data-clean-ind-test.csv).
df.to_csv(
    f'{OUTPUT_DIR}/data-clean-{LANGUAGE}-test.csv',
    columns=[
        'comment',
        'cleanComment',
        'resultCaseFolding',
        'resultTokenize',
        'resultStopword',
        'resultStemming',
    ],
    index=False
)