In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report

In [5]:
# Load dataset
def load_data():
    data = pd.read_csv("Data/digital_korlantas_reviews_cleaned.csv")
    return data

df = load_data()
df = df[["content_clean","Sentiment"]]
df.head()

Unnamed: 0,content_clean,Sentiment
0,terima kasih,Positif
1,panjang sim tp bkan nama yg daftar aplikasi,Positif
2,nanya kalo sim udan mati th panjang,Positif
3,banding polres kacau,Negatif
4,mohon maaf gagal verif ktp yaa,Negatif


In [20]:
df = df.astype({"content_clean":"string","Sentiment":"category"})
df.dtypes

content_clean      string
Sentiment        category
dtype: object

In [28]:
tf = TfidfVectorizer()
text_tf=tf.fit_transform(df["content_clean"].astype("U"))
text_tf

<500x1218 sparse matrix of type '<class 'numpy.float64'>'
	with 4511 stored elements in Compressed Sparse Row format>

In [30]:
X_train, X_test, y_train, y_test = train_test_split(text_tf,df["Sentiment"], test_size=0.2, random_state=42)

In [47]:
model = MultinomialNB().fit(X_train,y_train)
predicted = model.predict(X_test)
print("acc",accuracy_score(y_test,predicted))
print("prec",precision_score(y_test,predicted,average="micro"))
print("rec",recall_score(y_test,predicted,average="micro"))
print("f1",f1_score(y_test,predicted,average="micro"))
print(confusion_matrix(y_test,predicted))
print(classification_report(y_test, predicted, zero_division=0))

acc 0.67
prec 0.67
rec 0.67
f1 0.67
[[ 3  0 19]
 [ 0  0 14]
 [ 0  0 64]]
              precision    recall  f1-score   support

     Negatif       1.00      0.14      0.24        22
      Netral       0.00      0.00      0.00        14
     Positif       0.66      1.00      0.80        64

    accuracy                           0.67       100
   macro avg       0.55      0.38      0.35       100
weighted avg       0.64      0.67      0.56       100



In [57]:
kf = KFold(n_splits=10)
X_4kfold = text_tf.toarray()
Y_4kfold  = df["Sentiment"].copy()
def cross_val(estimator):
    acc = []
    prec = []
    rec = []
    for train_index, test_index in kf.split(X_4kfold,Y_4kfold):
        X_train, X_test = X_4kfold[train_index], X_4kfold[test_index]
        y_train, y_test = Y_4kfold[train_index], Y_4kfold[test_index]
        model = estimator.fit(X_train,y_train)
        predicted = model.predict(X_test)
        acc.append(accuracy_score(y_test,predicted))
        prec.append(precision_score(y_test,predicted,average="micro"))
        rec.append(recall_score(y_test,predicted,average="micro"))
        
        print("acc",accuracy_score(y_test,predicted))
        print("prec",precision_score(y_test,predicted,average="micro"))
        print("rec",recall_score(y_test,predicted,average="micro"))
        print("f1",f1_score(y_test,predicted,average="micro"))
        print(confusion_matrix(y_test,predicted))
        print(classification_report(y_test, predicted, zero_division=0))
        print("==================================================================")
    
    print("Average========================================================")
    print(f"acc : {np.mean(acc)}")
    print(f"prec : {np.mean(prec)}")
    print(f"rec : {np.mean(rec)}")

model = MultinomialNB()
cross_val(model)

acc 0.58
prec 0.58
rec 0.58
f1 0.58
[[ 0  0 14]
 [ 0  0  7]
 [ 0  0 29]]
              precision    recall  f1-score   support

     Negatif       0.00      0.00      0.00        14
      Netral       0.00      0.00      0.00         7
     Positif       0.58      1.00      0.73        29

    accuracy                           0.58        50
   macro avg       0.19      0.33      0.24        50
weighted avg       0.34      0.58      0.43        50

acc 0.58
prec 0.58
rec 0.58
f1 0.58
[[ 1  0 13]
 [ 0  0  8]
 [ 0  0 28]]
              precision    recall  f1-score   support

     Negatif       1.00      0.07      0.13        14
      Netral       0.00      0.00      0.00         8
     Positif       0.57      1.00      0.73        28

    accuracy                           0.58        50
   macro avg       0.52      0.36      0.29        50
weighted avg       0.60      0.58      0.44        50

acc 0.56
prec 0.56
rec 0.56
f1 0.56
[[ 0  0 15]
 [ 0  0  7]
 [ 0  0 28]]
              preci