In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Load dataset
def load_data(path="digital_korlantas_reviews_5000_preprocessed.csv"):
    """Read the preprocessed review dataset from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the file name this notebook
        has always used, so calling ``load_data()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The CSV contents exactly as parsed by ``pandas.read_csv``.
    """
    return pd.read_csv(path)

# Load the reviews and keep only the preprocessed text and its sentiment label
df = load_data()[["content_preprocessed", "sentiment"]]
df.head()

Unnamed: 0,content_preprocessed,sentiment
0,mantap,Positif
1,moga,Positif
2,ok,Positif
3,bantu buat sim online,Positif
4,alhamdulillah sim darat selamat rumah terima k...,Positif


In [3]:
# Cast the review text to pandas' "string" dtype and the sentiment label to
# "category", then show the resulting column dtypes.
df = df.astype(
    dtype=dict(content_preprocessed="string", sentiment="category")
)
df.dtypes

content_preprocessed      string
sentiment               category
dtype: object

In [4]:
# TF-IDF vectorisation: turn each preprocessed review (a string) into a sparse,
# TF-IDF-weighted term vector.
# Missing reviews are replaced with "" first: casting pd.NA straight to str
# yields the literal text "<NA>", which would otherwise be vectorised as a term.
# NOTE(review): the vectorizer is fitted on the whole dataset, i.e. before the
# train/test split performed in the next cell, so the vocabulary and IDF weights
# also see the test reviews — consider fitting on the training split only.
tf = TfidfVectorizer()
text_tf = tf.fit_transform(df["content_preprocessed"].fillna("").astype("U"))
text_tf

<2630x2778 sparse matrix of type '<class 'numpy.float64'>'
	with 22466 stored elements in Compressed Sparse Row format>

In [5]:
# Split the TF-IDF matrix and the sentiment labels into train and test subsets
# (20% of the rows go to the test subset; the seed is fixed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    text_tf,
    df["sentiment"],
    test_size=0.2,
    random_state=14,
)

In [6]:
# Number of samples per sentiment class in each subset.
# Each line is labelled with the class actually being counted (previously the
# "Negatif" counts were also printed under a "Positif class" label).
for split_name, labels in (("train", y_train), ("test", y_test)):
    for class_name in ("Positif", "Negatif"):
        # Vectorised count of rows whose label equals class_name.
        class_count = (labels == class_name).sum()
        print(f"{class_name} class at {split_name} dataset {class_count}")


Positif class at train dataset 1039
Positif class at train dataset 1065
Positif class at test dataset 268
Positif class at test dataset 258


In [7]:
# Evaluate Multinomial Naive Bayes on the test subset: accuracy, then
# precision / recall / F1 with "Negatif" as the positive label, followed by
# the confusion matrix and the per-class classification report.
model = MultinomialNB().fit(X_train, y_train)
predicted = model.predict(X_test)

print("accuracy", accuracy_score(y_test, predicted))

# The three binary metrics share the same keyword arguments, so score them in a loop.
for metric_label, metric_fn in (
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1 score", f1_score),
):
    print(metric_label, metric_fn(y_test, predicted, average="binary", pos_label="Negatif"))

print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted, zero_division=0))

accuracy 0.8003802281368821
precision 0.7703180212014135
recall 0.8449612403100775
f1 score 0.8059149722735675
[[218  40]
 [ 65 203]]
              precision    recall  f1-score   support

     Negatif       0.77      0.84      0.81       258
     Positif       0.84      0.76      0.79       268

    accuracy                           0.80       526
   macro avg       0.80      0.80      0.80       526
weighted avg       0.80      0.80      0.80       526



In [9]:
# (Disabled) 10-fold cross-validation of the classifier; the code below is kept commented out.
# kf = KFold(n_splits=10)
# X_4kfold = text_tf.toarray()
# Y_4kfold  = df["sentiment"].copy()
# def cross_val(estimator):
#     acc = []
#     prec = []
#     rec = []
#     f1 = []
#     for train_index, test_index in kf.split(X_4kfold,Y_4kfold):
#         X_train, X_test = X_4kfold[train_index], X_4kfold[test_index]
#         y_train, y_test = Y_4kfold[train_index], Y_4kfold[test_index]
#         model = estimator.fit(X_train,y_train)
#         predicted = model.predict(X_test)
#         acc.append(accuracy_score(y_test,predicted))
#         prec.append(precision_score(y_test,predicted,average="binary", pos_label="Negatif"))
#         rec.append(recall_score(y_test,predicted,average="binary", pos_label="Negatif"))
#         f1.append(f1_score(y_test,predicted,average="binary", pos_label="Negatif"))
        
#         print("acc",accuracy_score(y_test,predicted))
#         print("prec",precision_score(y_test,predicted,average="binary", pos_label="Negatif"))
#         print("rec",recall_score(y_test,predicted,average="binary", pos_label="Negatif"))
#         print("f1",f1_score(y_test,predicted,average="binary", pos_label="Negatif"))
#         print(confusion_matrix(y_test,predicted))
#         print(classification_report(y_test, predicted, zero_division=0))
#         print("==================================================================")
    
#     print("Average========================================================")
#     print(f"accuracy : {np.mean(acc)}")
#     print(f"precision : {np.mean(prec)}")
#     print(f"recall : {np.mean(rec)}")
#     print(f"f1 score : {np.mean(f1)}")

# model = MultinomialNB()
# cross_val(model)