In [25]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import unicodedata
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

In [26]:
def removeStopword(str):
    stop_words = set(stopwords.words('indonesian'))
    word_tokens = word_tokenize(str)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_sentence)

#remove sentence which contains only one word
def removeSentence(str): 
    word = str.split()
    wordCount = len(word)
    if(wordCount<=1):
        str = ''
    
    return str

def cleaning(str):
    #remove non-ascii
    str = unicodedata.normalize('NFKD', str).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    #remove URLs
    str = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', str)
    #remove punctuations
    str = re.sub(r'[^\w]|_',' ',str)
    #remove digit from string
    str = re.sub("\S*\d\S*", "", str).strip()
    #remove digit or numbers
    str = re.sub(r"\b\d+\b", " ", str)
    #to lowercase
    str = str.lower()
    #Remove additional white spaces
    str = re.sub('[\s]+', ' ', str)
       
    return str


def preprocessing(str):
    str = ' '.join(re.sub(r'@[A-Za-z0-9]+','',str).split()) 
    str = removeSentence(str)
    str = cleaning(str)
    str = removeStopword(str)
    
    return str

In [27]:
# Proses pengambilan data
train = pd.ExcelFile('TRAINING_KOTOR.xlsx')
test = pd.ExcelFile('TESTING_KOTOR2.xlsx')
dtr = pd.read_excel(train, 'Sheet1')
dts = pd.read_excel(test, 'Sheet1')
tweet_t = dtr["tweet"].tolist()
label_t = dtr["label"].tolist()
tweet = dts["tweet"].tolist()
label = dts["label"].tolist()

In [28]:
# Preview Data Pelatihan
#kata_kotor = 0, bukan_kata_kotor = 1
dtr['label'] = pd.factorize(dtr.label)[0]
dtr.head()

Unnamed: 0,tweet,label
0,Kamu ini terlalu banyak bacot. Mau aku bacok kah?,0
1,"@foxluvme lo bacot kuadrat , apalagi ka oca",0
2,@kdanielsrl_ Jujur gue lupa fi ke lo kygmn hyu...,0
3,"@bynzie Zahra, 03L, Renjun\n\nBibin lucu terus...",0
4,"Halah bacot dating-dating, percaya sama dispat...",0


In [29]:
# Preview Data Pengujian
#kata_kotor = 0, bukan_kata_kotor = 1
dts['label'] = pd.factorize(dts.label)[0]
dts.head()

Unnamed: 0,tweet,label
0,Me: debat dengan provide data dan hitungan-hit...,0
1,ASE: lo ngapain masuk ngantor? kenapa gak WFH ...,0
2,@P3nj3l4j4h @natadiningrat99 Lala lama eneg ng...,0
3,@cingu24 Bacot emg tuh satu.,0
4,baru tgl 2 weh udah ada anjing bacot aja,0


In [30]:
# Proses preprocessing data pelatihan
cleaned = []
for x in tweet_t:
    smntr = preprocessing(x)
    cleaned.append(smntr)
    
len(cleaned)

4500

In [31]:
# Proses preprocessing data pengujian
cleaned_test = []
for x in tweet:
    smntr = preprocessing(x)
    cleaned_test.append(smntr)
    
len(cleaned_test)

893

In [32]:
# Klasifikasi Naive Bayes
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(cleaned)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, label_t)
blob = clf.predict(count_vect.transform(cleaned_test))

In [33]:
# Hasil prediksi dengan Naive Bayes
hasilnya = []
i = 0
while i < len(blob):
    smntr = (cleaned_test[i], label[i], blob[i])
    hasilnya.append(smntr)
    i += 1
labels = ['Tweet', 'Aktual', 'Prediksi']
df = pd.DataFrame.from_records(hasilnya, columns=labels)
df.head(10)

Unnamed: 0,Tweet,Aktual,Prediksi
0,me debat provide data hitungan hitungan bukti ...,0,0
1,ase lo ngapain masuk ngantor gak wfh aja packi...,0,0
2,lala eneg ngeliat pasangan capres ga berilmu c...,0,0
3,bacot emg tuh,0,0
4,tgl weh udah anjing bacot aja,0,0
5,haean bacot melulu orang sakit bloon,0,0
6,rpw need rp nct join gdm bacot aktif baperan u...,0,0
7,bacot wkwkw bener,0,0
8,bacot ya hean lo limit aja limit dm,0,0
9,lo bacot goblog semangat ya melawan juliders,0,0


In [34]:
# Tingkat akurasi dengan Naive Bayes
print(str('%.3f'%(np.mean(blob == label)*100)) + "%")

97.872%


In [35]:
# Menghitung Precision, Recall dan F-score Naive Bayes
print (classification_report(label, blob))

             precision    recall  f1-score   support

          0       0.96      1.00      0.98       446
          1       1.00      0.96      0.98       447

avg / total       0.98      0.98      0.98       893



In [36]:
# Klasifikasi SVM
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', LinearSVC())])
text_clf_svm = text_clf_svm.fit(cleaned, label_t)
predicted = text_clf_svm.predict(cleaned_test)

In [37]:
# Hasil prediksi dengan Naive Bayes
hasil_p = []
i = 0
while i < len(predicted):
    smntr = (cleaned_test[i], label[i], predicted[i])
    hasil_p.append(smntr)
    i += 1
labels = ['Tweet', 'Aktual', 'Prediksi']
df = pd.DataFrame.from_records(hasil_p, columns=labels)
df.head(10)

Unnamed: 0,Tweet,Aktual,Prediksi
0,me debat provide data hitungan hitungan bukti ...,0,0
1,ase lo ngapain masuk ngantor gak wfh aja packi...,0,0
2,lala eneg ngeliat pasangan capres ga berilmu c...,0,0
3,bacot emg tuh,0,0
4,tgl weh udah anjing bacot aja,0,0
5,haean bacot melulu orang sakit bloon,0,0
6,rpw need rp nct join gdm bacot aktif baperan u...,0,0
7,bacot wkwkw bener,0,0
8,bacot ya hean lo limit aja limit dm,0,0
9,lo bacot goblog semangat ya melawan juliders,0,0


In [38]:
# Hasil prediksi dengan SVM
print(str('%.3f'%(np.mean(predicted == label)*100)) + "%")

96.976%


In [39]:
# Menghitung Precision, Recall dan F-score SVM
print (classification_report(label, predicted))

             precision    recall  f1-score   support

          0       0.99      0.95      0.97       446
          1       0.95      0.99      0.97       447

avg / total       0.97      0.97      0.97       893

