In [1]:
# Library
import re
import requests
import nltk
from nltk.tokenize import word_tokenize
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from nltk.classify import util
from sklearn import linear_model
from numpy import array
from sklearn.naive_bayes import ComplementNB
import pandas as pd
import pickle

In [2]:
# Gensim
import gensim
from gensim.utils import simple_preprocess

In [3]:
factory = StopWordRemoverFactory()
more_stopword = ['di',"ke",'ber','mah','nya','pas','in','an','se']
sastrawi_stopword = factory.get_stop_words()+more_stopword
# sastrawi_stopword = factory.get_stop_words()
    
# create path url for each stopword
path_stopwords = []

# combine stopwords
stopwords_l = sastrawi_stopword
for path in path_stopwords:
    response = requests.get(path)
    stopwords_l += response.text.split('\n')

# create dictionary with unique stopword
st_words = set(stopwords_l)

# result stopwords
stop_words = st_words

In [4]:
print(st_words)

{'tapi', 'sedangkan', 'kemana', 'sekitar', 'agak', 'daripada', 'masih', 'sambil', 'se', 'tetapi', 'antara', 'kecuali', 'kami', 'bisa', 'bahwa', 'anda', 'mari', 'demikian', 'namun', 'apakah', 'dst', 'setidaknya', 'nya', 'walau', 'apalagi', 'adalah', 'di', 'seraya', 'sementara', 'dahulu', 'yang', 'saja', 'kenapa', 'hanya', 'setelah', 'hal', 'seperti', 'setiap', 'guna', 'boleh', 'ok', 'oleh', 'lain', 'dalam', 'mah', 'melainkan', 'yakni', 'in', 'atau', 'tolong', 'supaya', 'ya', 'kita', 'anu', 'tanpa', 'seharusnya', 'yaitu', 'ber', 'pas', 'belum', 'seolah', 'bagaimanapun', 'demi', 'lagi', 'kah', 'dulunya', 'maka', 'ia', 'serta', 'sampai', 'bagi', 'dsb', 'dari', 'secara', 'amat', 'toh', 'para', 'terhadap', 'mengapa', 'sebab', 'dapat', 'sebelum', 'an', 'pun', 'nggak', 'oh', 'untuk', 'sudah', 'dimana', 'saat', 'itu', 'harus', 'kepada', 'karena', 'selagi', 'saya', 'pasti', 'begitu', 'ketika', 'telah', 'ingin', 'ke', 'menurut', 'dia', 'agar', 'tidak', 'jika', 'dua', 'itulah', 'kembali', 'sesuatu

In [4]:
# Function Preprocessing

def case_folding(text):
    text = text.lower() # lowercase
    return text

def emoji(text):
    text = re.sub(r'[^\x00-\x7f]', r'', text) # Remove non ASCII chars
    text = re.sub(r'(\\u[0-9A-Fa-f]+)', r'', text)
    return text

def cleaning_text(text):
    # Cleaning text
    text = re.sub(r'@[\w]*', ' ', text) # Remove mention handle user (@)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r'\\u\w\w\w\w', '', text) # Remove link web
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'#([^\s]+)', '', text) # Remove #tagger
    text = re.sub(r"[.,:;+!\-_<^/=?\"'\(\)\d\*]", " ", text) # Remove simbol, angka dan karakter aneh
    return text

def replaceThreeOrMore(text):
    # Pattern to look for three or more repetitions of any character, including newlines (contoh goool -> gol).
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1", text) #2 atau lebih
    # return pattern.sub(r"\1\1", text) #3 atau lebih

def tokenize(text):
    return word_tokenize(text)

def convertToSlangword(text):
    kamus_slangword = eval(open("kamus_normalisasi_baru.txt").read()) # Membuka dictionary slangword
    pattern = re.compile(r'\b( ' + '|'.join (kamus_slangword.keys())+r')\b') # Search pola kata (contoh kpn -> kapan)
    content = [] # menginisialisasi data kosong untuk hasil normalisasi
    for kata in text: #iterasi melaui setiap kata dalam teks
        filteredSlang = pattern.sub(lambda x: kamus_slangword[x.group()],kata) # mengganti slangword berdasarkan pola regex yg telah ditentukan dan kamus slangword
        content.append(filteredSlang.lower())#menambahkan kata yang telah di word normalisasi kedalam daftar content hurufnya kecil
    text = content #mengganti text dengan content yang telah di word normalisasi
    return text

def remove_stopword(text, stop_words=stop_words):
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_sentence)

def stemming_and_lemmatization(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(text)

In [5]:
# Skema Preprocessing

def prepro_1(data):
    data['text'] = data['comment'].astype(str)
    data['Clean_Twt'] = data['comment'].apply(case_folding)
    data['Clean_Twt'] = data['Clean_Twt'].apply(emoji)
    data['Clean_Twt'] = data['Clean_Twt'].apply(cleaning_text)
    data['Clean_Twt'] = data['Clean_Twt'].astype(str)
    data['Repeat'] = data['Clean_Twt'].apply(replaceThreeOrMore)
    data['Tokenize_Tweet'] = data['Repeat'].apply(tokenize).astype(str)
    data['Slang_Tweet'] = data['Tokenize_Tweet'].apply(convertToSlangword)
    data['Slang_Tweet'] = data['Tokenize_Tweet'].apply(" ".join)
    data['Stem'] = data['Slang_Tweet'].apply(stemming_and_lemmatization)
    # data['Negasi'] = data['Slang_Tweet'].apply(ganti_negasi)
    data['Stopwords'] = data['Stem'].apply(remove_stopword)
    data['comment'] = data['Stopwords']
    data = data[['comment','Sentiments','Label']]
    return data
    
def prepro_2(data):
    # data = str[data]
    data['text'] = data['comment'].astype(str)
    data['Clean_Twt'] = data['comment'].apply(case_folding)
    data['Clean_Twt'] = data['Clean_Twt'].apply(emoji)
    data['Clean_Twt'] = data['Clean_Twt'].apply(cleaning_text)
    data['Clean_Twt'] = data['Clean_Twt'].astype(str)
    data['Repeat'] = data['Clean_Twt'].apply(replaceThreeOrMore)
    data['Tokenize_Tweet'] = data['Repeat'].apply(tokenize)
    # data['Slang_Tweet'] = data['Tokenize_Tweet'].apply(convertToSlangword)
    data['Slang_Tweet'] = data['Tokenize_Tweet'].apply(" ".join)
    data['Stem'] = data['Slang_Tweet'].apply(stemming_and_lemmatization)
    # data['Negasi'] = data['Slang_Tweet'].apply(ganti_negasi)
    data['Stopwords'] = data['Stem'].apply(remove_stopword)
    data['comment'] = data['Stopwords']
    data = data[['comment','Sentiments','Label']]
    return data



In [330]:
# Skema Preprocessing ouput satu-satu

def prepro_3(data):
    data['text'] = data['comment'].astype(str)
    data['Clean_Twt'] = data['comment'].apply(case_folding)
    data['Clean_Twt'] = data['Clean_Twt'].apply(emoji)
    data['Clean_Twt'] = data['Clean_Twt'].apply(cleaning_text)
    data['Clean_Twt'] = data['Clean_Twt'].astype(str)
    data['Repeat'] = data['Clean_Twt'].apply(replaceThreeOrMore)
    data['Tokenize_Tweet'] = data['Repeat'].apply(tokenize)
    data['Slang_Tweet'] = data['Tokenize_Tweet'].apply(convertToSlangword)
    data['Slang_Tweet'] = data['Slang_Tweet'].apply(" ".join)
    data['Stem'] = data['Slang_Tweet'].apply(stemming_and_lemmatization)
    # data['Negasi'] = data['Slang_Tweet'].apply(ganti_negasi)
    data['Stopwords'] = data['Stem'].apply(remove_stopword)
    data['comment'] = data['Stopwords']
    data = data[['comment','Sentiments','Label']]
    return data

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Read data
data = pd.read_csv('DataHasilLabelingFix.csv', encoding='latin-1')
data = data[['comment','Sentiments','Label']]
# data = data[['comment']]
data.head(10)

Unnamed: 0,comment,Sentiments,Label
0,ini sama kaya anak aku kmrn cuma dikasih obat...,1,positif
1,Anak saya waktu demam diatas 40 sama pas test ...,1,positif
2,@cu.suminar sekarang keadaannya gmn mom? Semog...,1,positif
3,alhamdulillah skrg sudah sehat lagi momin ber...,1,positif
4,betul.. anak2 saya semua ga sampe 40 tertanga...,1,positif
5,"anak harus d daftar kn dl BPJS drmom, jd terp...",1,positif
6,Mudah2 ank2 kita semua di jauhakan dri segala ...,1,positif
7,Patah tulang jg kak. Alhamdulillah kemarin ana...,1,positif
8,@permaditya alhamdulillah semoga mom dan si ke...,1,positif
9,"Anak waktu itu datang kondisinya menggigil, te...",1,positif


In [8]:
data['Label'].value_counts()

positif    350
netral     350
negatif    350
Name: Label, dtype: int64

In [9]:
data.Label.unique()
data['Sentiments'] = data['Label'].map({'positif': 1, 'netral': 0, 'negatif': -1})
data = data[['comment','Label', 'Sentiments']]
data.head(5)

Unnamed: 0,comment,Label,Sentiments
0,ini sama kaya anak aku kmrn cuma dikasih obat...,positif,1
1,Anak saya waktu demam diatas 40 sama pas test ...,positif,1
2,@cu.suminar sekarang keadaannya gmn mom? Semog...,positif,1
3,alhamdulillah skrg sudah sehat lagi momin ber...,positif,1
4,betul.. anak2 saya semua ga sampe 40 tertanga...,positif,1


**Preprocessing**

In [29]:
prepro_1(data)

NameError: name 'prepro_1' is not defined

In [69]:
dt_gabungan = prepro_1(data)
dt_gabungan.to_csv("DataHasilPrepro1.1.csv", index=False)

In [34]:
import pandas as pd

In [38]:
data = pd.read_csv('DataHasilPreproBPJS.csv')
data

Unnamed: 0,comment,Sentiments,Label
0,sama kaya anak aku kemarin kasih obat plg alha...,1,positif
1,anak waktu demam atas sama pas test darah semu...,1,positif
2,suminar sekarang bagaimana mom moga sehat momi...,1,positif
3,alhamdulilah sekarang sehat momin berkat bpjs ...,1,positif
4,betul anak semua tangan baik bpjs,1,positif
...,...,...,...
1045,bukan pakai bpjs langsung rsud kalau kalau dar...,-1,negatif
1046,lalu habis lihat total harga periksa sana sini...,-1,negatif
1047,bpjs memang laknat rakyat rasa lelah lima tahu...,-1,negatif
1048,astaghfirulah adzim peraturanya kok tambah tol...,-1,negatif


In [39]:
data['Sentiments'] = data['Label'].map({'positif': 1, 'netral': 0, 'negatif': -1})
data = data[['comment', 'Sentiments']]
data.head(5)

Unnamed: 0,comment,Sentiments
0,sama kaya anak aku kemarin kasih obat plg alha...,1
1,anak waktu demam atas sama pas test darah semu...,1
2,suminar sekarang bagaimana mom moga sehat momi...,1
3,alhamdulilah sekarang sehat momin berkat bpjs ...,1
4,betul anak semua tangan baik bpjs,1


**Pembagian Data**

In [40]:
import random
from sklearn.model_selection import train_test_split

In [41]:
# Memisahkan data test dan train

x_train,x_test,y_train, y_test = train_test_split(data['comment'], data['Sentiments'], test_size = 0.2, random_state = 42 )
# random_state = 50 menyatakan adanya pengacakan pada data yang di split

In [42]:
print('X train :', len(x_train))
print('X test :', len(x_test))
print('y train :', len(y_train))
print('y test :', len(y_test))

X train : 840
X test : 210
y train : 840
y test : 210


In [43]:
df_train= pd.DataFrame()
df_train['comment'] = x_train
df_train['Sentiments'] = y_train

df_test = pd.DataFrame()
df_test['comment'] = x_test
df_test['Sentiments'] = y_test

In [44]:
len(df_train)

840

In [45]:
len(df_test)

210

In [47]:
df_test["Sentiments"].value_counts()

 1    76
 0    72
-1    62
Name: Sentiments, dtype: int64

In [48]:
df_train.to_csv(r"data_train_prepro1.csv")
df_test.to_csv(r"data_test_prepro1.csv")

In [51]:
df_train

Unnamed: 0,comment,Sentiments
554,salam sehat sahabat mohon maaf atas ketidaknya...,0
1012,mau operasi jantung daftar tunggu lama bulan b...,-1
481,kok mending besar iuranya sama semua mampu uru...,0
432,kalau bayar mandiri bagaimana,0
626,halo admin mau tanya kalau mau update kartu as...,0
...,...,...
330,bapak kurang lebih tahun lalu opname tindak be...,1
466,kerja bagaimana,0
121,kalau wajib punya asuransi bpjs sama bagus,1
1044,tenaga sehat jahat lebih ting berkas ketimbang...,-1


**TF-IDF**

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
# Word Vectorization (TF-IDF)
# untuk mendapatkan 50000 top term dengan term frequency terbesar.
tfidf_vect = TfidfVectorizer(max_features = 5000,
                            lowercase=False)
train_X_tfidf = tfidf_vect.fit_transform(df_train['comment']) #melatih model training
test_X_tfidf = tfidf_vect.transform(df_test['comment'])

In [54]:
# Save to pickle dengan word normalization
tfidf = tfidf_vect
pickle.dump(tfidf.vocabulary_,open("tfidfBPJS1.sav","wb"))


In [24]:
# Save to pickle tanpa word normalization
tfidf = tfidf_vect
pickle.dump(tfidf.vocabulary_,open("tfidfBPJS2.sav","wb"))

In [55]:
tfidf_vect

In [56]:
print(train_X_tfidf)

  (0, 1731)	0.2621501841988313
  (0, 930)	0.15868557708036207
  (0, 2120)	0.17141674804124063
  (0, 456)	0.27402989063563615
  (0, 1803)	0.1445510166061068
  (0, 2067)	0.21678346612003593
  (0, 249)	0.2907733964566314
  (0, 161)	0.21041788063959668
  (0, 534)	0.14638663286623568
  (0, 850)	0.22866317255684085
  (0, 1253)	0.22431235574932096
  (0, 769)	0.2529355680071211
  (0, 435)	0.2529355680071211
  (0, 1091)	0.17258005219008637
  (0, 1786)	0.15391552249552068
  (0, 999)	0.2621501841988313
  (0, 137)	0.21041788063959668
  (0, 1171)	0.24540667837783606
  (0, 1314)	0.1978072778413867
  (0, 1800)	0.19568914349152083
  (0, 1865)	0.10250625500119069
  (0, 1807)	0.19568914349152083
  (1, 1239)	0.35907611372891834
  (1, 1871)	0.37994773067238874
  (1, 337)	0.4792978495937372
  :	:
  (837, 163)	0.4260738078631673
  (837, 901)	0.2834952185678024
  (837, 309)	0.1503314823678718
  (837, 1812)	0.32151693876855897
  (838, 2148)	0.43474247131077703
  (838, 1001)	0.39194720122950977
  (838, 259)	0.

In [57]:
print(test_X_tfidf)

  (0, 2130)	0.19526695741642816
  (0, 1927)	0.18715713569069542
  (0, 1865)	0.31913357023141703
  (0, 1828)	0.2052783438908072
  (0, 1812)	0.2585047174640593
  (0, 1773)	0.2791743499085668
  (0, 1242)	0.3017557388357818
  (0, 1116)	0.12887142966226198
  (0, 882)	0.24806953620368483
  (0, 862)	0.1755740495228934
  (0, 770)	0.1935175619403686
  (0, 610)	0.5687596810403391
  (0, 596)	0.2030802088362045
  (0, 309)	0.12086889581713123
  (0, 169)	0.14586975515497963
  (1, 2192)	0.33906727705933914
  (1, 2100)	0.31954288109899964
  (1, 2049)	0.23893575100628797
  (1, 1948)	0.30569010170832206
  (1, 1865)	0.23906256341431079
  (1, 1366)	0.2615678758838307
  (1, 1309)	0.30569010170832206
  (1, 1177)	0.2949450512348477
  (1, 904)	0.2615678758838307
  (1, 831)	0.21198852420226463
  :	:
  (207, 807)	0.19119206013600795
  (207, 731)	0.3116524719065012
  (207, 631)	0.47213349624670314
  (207, 596)	0.22255603267561
  (207, 309)	0.0662302399655271
  (207, 267)	0.4319844914446401
  (207, 211)	0.1424941

In [58]:
print(train_X_tfidf.shape)
print(test_X_tfidf.shape)

(840, 2329)
(210, 2329)


In [59]:
print(tfidf_vect.vocabulary_) #jumlah term dalam bentuk vector

{'salam': 1807, 'sehat': 1865, 'sahabat': 1800, 'mohon': 1314, 'maaf': 1171, 'atas': 137, 'ketidaknyamananya': 999, 'rujuk': 1786, 'laku': 1091, 'dasar': 435, 'indikasi': 769, 'medis': 1253, 'jelas': 850, 'dokter': 534, 'bagai': 161, 'bentuk': 249, 'tangan': 2067, 'sakit': 1803, 'derita': 456, 'terima': 2120, 'kasih': 930, 'rein': 1731, 'mau': 1241, 'operasi': 1454, 'jantung': 836, 'daftar': 423, 'tunggu': 2201, 'lama': 1093, 'bulan': 337, 'sekarat': 1871, 'mati': 1239, 'kok': 1029, 'mending': 1262, 'besar': 267, 'iuranya': 808, 'sama': 1812, 'semua': 1899, 'mampu': 1199, 'urus': 2240, 'langsung': 1100, 'bpjs': 309, 'bukti': 336, 'tinggal': 2150, 'sistem': 1957, 'atur': 140, 'kolom': 1031, 'kalau': 901, 'bayar': 211, 'mandiri': 1206, 'bagaimana': 162, 'halo': 689, 'admin': 15, 'tanya': 2077, 'update': 2236, 'kartu': 924, 'askes': 130, 'cara': 366, 'gimana': 663, 'alhamdulilah': 61, 'kmren': 1025, 'mulai': 1331, 'pemeriksan': 1545, 'lahir': 1086, 'ful': 626, 'cover': 408, 'layan': 1116,

In [60]:
len(tfidf_vect.vocabulary_)

2329

## K-Nearest Neighbor

In [23]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

In [24]:
model_KNN = KNeighborsClassifier(n_neighbors=3) #inisialisasi dengan k=3
KNN4no = model_KNN.fit(train_X_tfidf,y_train) #melatih model knn menggunakan data pelatihan

y_pred_KNN2 = KNN4no.predict(test_X_tfidf) #melakukan prediksi model knn yang telah dilatih
# y_prob_KNN2 = KNN2.decision_function(test_X_tfidf)

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import time

# Inisialisasi model KNN dengan 3 tetangga terdekat
model_KNN = KNeighborsClassifier(n_neighbors=3)

# Training model KNN
start_time = time.time()
KNN4no = model_KNN.fit(train_X_tfidf, y_train)
end_time = time.time()

training_time = end_time - start_time

# Prediksi dengan model KNN
start_time = time.time()
y_pred_KNN2 = KNN4no.predict(test_X_tfidf)
end_time = time.time()

inference_time = end_time - start_time

# Mengukur akurasi
accuracy = accuracy_score(y_test, y_pred_KNN2)

print("Akurasi:", accuracy)
print("Waktu Pelatihan:", training_time, "detik")



Akurasi: 0.8047619047619048
Waktu Pelatihan: 0.0 detik
Waktu Prediksi: 0.05445432662963867 detik


In [116]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

predicted = model_KNN.predict(test_X_tfidf)

CM = confusion_matrix(y_test, predicted)

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

          -1       0.79      0.79      0.79        62
           0       0.77      0.88      0.82        72
           1       0.86      0.75      0.80        76

    accuracy                           0.80       210
   macro avg       0.81      0.81      0.80       210
weighted avg       0.81      0.80      0.80       210



In [50]:
acc_KNN2 = accuracy_score(df_test['Sentiments'], y_pred_KNN2)
precision_KNN2 = precision_score(df_test['Sentiments'], y_pred_KNN2, average='weighted')
recall_KNN2 = recall_score(df_test['Sentiments'], y_pred_KNN2, average='weighted')
f1_KNN2 = f1_score(df_test['Sentiments'], y_pred_KNN2, average='weighted')

# Result
print("Accuracy: {:.2f}".format(acc_KNN2*100),end='\n\n')
print("Precision: ", precision_KNN2,end='\n\n')
print("Recall: ", recall_KNN2,end='\n\n')
print("F1-Score: ", f1_KNN2,end='\n\n')

Accuracy: 77.14

Precision:  0.7812534365135932

Recall:  0.7714285714285715

F1-Score:  0.771559561409793



In [118]:
confusion_matrix(y_test,y_pred_KNN2)

array([[49,  7,  6],
       [ 6, 63,  3],
       [ 7, 12, 57]], dtype=int64)

In [26]:
with open('KNN3_no.sav', 'wb') as f: #k=3
    pickle.dump(KNN4no, f)

In [239]:
with open('KNN4_full.sav', 'wb') as f: #k=4
    pickle.dump(KNN4, f)


## Naive Bayes

In [32]:
# Naive Bayes
model_NB = ComplementNB() #inisialiasi library
train_X_tfidf = train_X_tfidf.todense() #mengubah representasi menjadi matrix dense atau matrix padat supaya matrixnya tidak nol terlalu banyak
test_X_tfidf = test_X_tfidf.todense()

# Training 
NB2 = model_NB.fit(train_X_tfidf,y_train) #melatih model menggunakan data pelatihan

# Evaluation
y_pred_nb2 = NB2.predict(test_X_tfidf) #melakukan prediksi
y_prob_nb2 = NB2.predict_proba(test_X_tfidf) #melakukan prediksi probabilitas



In [61]:
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score
import time

# Inisialisasi model Naive Bayes
model_NB = ComplementNB()

# Konversi matriks TF-IDF ke matriks dense
train_X_tfidf = train_X_tfidf.todense()
test_X_tfidf = test_X_tfidf.todense()

# Training model
start_time = time.time()
NB2 = model_NB.fit(train_X_tfidf, y_train)
end_time = time.time()

training_time = end_time - start_time

# Evaluasi model
start_time = time.time()
y_pred_nb2 = NB2.predict(test_X_tfidf)
end_time = time.time()

inference_time = end_time - start_time

# Mengukur akurasi
accuracy = accuracy_score(y_test, y_pred_nb2)

print("Akurasi:", accuracy)
print("Waktu Pelatihan:", training_time, "detik")
print("Waktu Prediksi:", inference_time, "detik")


Akurasi: 0.8714285714285714
Waktu Pelatihan: 0.05601978302001953 detik
Waktu Prediksi: 0.007994890213012695 detik




In [83]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

predicted = model_NB.predict(test_X_tfidf)

CM = confusion_matrix(y_test, predicted)

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

          -1       0.86      0.82      0.84        62
           0       0.87      0.94      0.91        72
           1       0.86      0.83      0.85        76

    accuracy                           0.87       210
   macro avg       0.87      0.87      0.87       210
weighted avg       0.87      0.87      0.87       210





In [None]:
# Function Accuration NB 1

# acc_NB1 = accuracy_score(df_test['Label'], y_pred_nb)
# precision_NB1 = precision_score(df_test['Label'], y_pred_nb, average='weighted')
# recall_NB1 = recall_score(df_test['Label'], y_pred_nb, average='weighted')
# f1_NB1 = f1_score(df_test['Label'], y_pred_nb, average='weighted')

# # Result
# print("Accuracy: {:.2f}".format(acc_NB1*100),end='\n\n')
# print("Precision: ", precision_NB1,end='\n\n')
# print("Recall: ", recall_NB1,end='\n\n')
# print("F1-Score: ", f1_NB1,end='\n\n')

In [84]:
# Function Accuration NB 2

acc_NB2 = accuracy_score(df_test['Sentiments'], y_pred_nb2)
precision_NB2 = precision_score(df_test['Sentiments'], y_pred_nb2, average='weighted')
recall_NB2 = recall_score(df_test['Sentiments'], y_pred_nb2, average='weighted')
f1_NB2 = f1_score(df_test['Sentiments'], y_pred_nb2, average='weighted')

# Result
print("Accuracy: {:.2f}".format(acc_NB2*100),end='\n\n')
print("Precision: ", precision_NB2,end='\n\n')
print("Recall: ", recall_NB2,end='\n\n')
print("F1-Score: ", f1_NB2,end='\n\n')

Accuracy: 86.67

Precision:  0.8664356771624011

Recall:  0.8666666666666667

F1-Score:  0.8657758056464584



In [85]:
confusion_matrix(y_test,y_pred_nb2)

array([[51,  3,  8],
       [ 2, 68,  2],
       [ 6,  7, 63]], dtype=int64)

In [341]:
# Save to pickle
pickle.dump(NB2,open("NB1_no.sav","wb"))
# with open('NB2_full.sav', 'wb') as f:
#     pickle.dump(NB2, f)

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

In [28]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import time

# Inisialisasi model SVM
model_SVM = SVC()

# Training model SVM
start_time = time.time()
SVM_model = model_SVM.fit(train_X_tfidf, y_train)
end_time = time.time()

training_time = end_time - start_time

# Prediksi dengan model SVM
start_time = time.time()
y_pred_SVM = SVM_model.predict(test_X_tfidf)
end_time = time.time()

inference_time = end_time - start_time

# Mengukur akurasi
accuracy = accuracy_score(y_test, y_pred_SVM)

print("Akurasi:", accuracy)
print("Waktu Pelatihan:", training_time, "detik")
# print("Waktu Prediksi:", inference_time, "detik")






Akurasi: 0.8857142857142857
Waktu Pelatihan: 11.255311965942383 detik
Waktu Prediksi: 2.510164499282837 detik


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Creating  a confusion matrix,which compares the y_test and y_pred
cm = confusion_matrix(df_test['sentiment'], y_pred_nb)

# Creating a dataframe for a array-formatted Confusion matrix,so it will be easy for plotting.
cm_df = pd.DataFrame(cm)

#Plotting the confusion matrix
plt.figure(figsize=(5,4))
sns.heatmap(cm_df, annot=True)
plt.title('Confusion Matrix')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()

In [None]:
from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(y_test, y_pred_svm, target_names=['Positif', 'Netral', 'Negatif']))

In [None]:
print(y_test)