In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report

In [7]:
# Load dataset
def load_data():
    data = pd.read_csv("digital_korlantas_reviews_5000_p.csv")
    return data

df = load_data()
df = df[["content_clean","sentiment"]]
df.head()

Unnamed: 0,content_clean,sentiment
0,mantap,Positif
1,moga,Positif
2,ok,Positif
3,bantu buat sim online,Positif
4,alhamdulillah sim darat selamat sampe rumah te...,Positif


In [8]:
df = df.astype({"content_clean":"string","sentiment":"category"})
df.dtypes

content_clean      string
sentiment        category
dtype: object

In [9]:
tf = TfidfVectorizer()
text_tf=tf.fit_transform(df["content_clean"].astype("U"))
text_tf

<2632x3436 sparse matrix of type '<class 'numpy.float64'>'
	with 25143 stored elements in Compressed Sparse Row format>

In [10]:
X_train, X_test, y_train, y_test = train_test_split(text_tf,df["sentiment"], test_size=0.2, random_state=42)

In [15]:
print(f"Positif class at train dataset {sum(y_train=='Positif')}")
print(f"Positif class at train dataset {sum(y_train=='Negatif')}")
print(f"Positif class at test dataset {sum(y_test=='Positif')}")
print(f"Positif class at test dataset {sum(y_test=='Negatif')}")


Positif class at train dataset 1038
Positif class at train dataset 1067
Positif class at test dataset 270
Positif class at test dataset 257


In [16]:
model = MultinomialNB().fit(X_train,y_train)
predicted = model.predict(X_test)
print("acc",accuracy_score(y_test,predicted))
print("prec",precision_score(y_test,predicted,average="binary", pos_label="Negatif"))
print("rec",recall_score(y_test,predicted,average="binary", pos_label="Negatif"))
print("f1",f1_score(y_test,predicted,average="binary", pos_label="Negatif"))
print(confusion_matrix(y_test,predicted))
print(classification_report(y_test, predicted, zero_division=0))

acc 0.793168880455408
prec 0.7681159420289855
rec 0.8249027237354085
f1 0.7954971857410882
[[212  45]
 [ 64 206]]
              precision    recall  f1-score   support

     Negatif       0.77      0.82      0.80       257
     Positif       0.82      0.76      0.79       270

    accuracy                           0.79       527
   macro avg       0.79      0.79      0.79       527
weighted avg       0.80      0.79      0.79       527



In [18]:
kf = KFold(n_splits=10)
X_4kfold = text_tf.toarray()
Y_4kfold  = df["sentiment"].copy()
def cross_val(estimator):
    acc = []
    prec = []
    rec = []
    f1 = []
    for train_index, test_index in kf.split(X_4kfold,Y_4kfold):
        X_train, X_test = X_4kfold[train_index], X_4kfold[test_index]
        y_train, y_test = Y_4kfold[train_index], Y_4kfold[test_index]
        model = estimator.fit(X_train,y_train)
        predicted = model.predict(X_test)
        acc.append(accuracy_score(y_test,predicted))
        prec.append(precision_score(y_test,predicted,average="binary", pos_label="Negatif"))
        rec.append(recall_score(y_test,predicted,average="binary", pos_label="Negatif"))
        f1.append(f1_score(y_test,predicted,average="binary", pos_label="Negatif"))
        
        print("acc",accuracy_score(y_test,predicted))
        print("prec",precision_score(y_test,predicted,average="binary", pos_label="Negatif"))
        print("rec",recall_score(y_test,predicted,average="binary", pos_label="Negatif"))
        print("f1",f1_score(y_test,predicted,average="binary", pos_label="Negatif"))
        print(confusion_matrix(y_test,predicted))
        print(classification_report(y_test, predicted, zero_division=0))
        print("==================================================================")
    
    print("Average========================================================")
    print(f"acc : {np.mean(acc)}")
    print(f"prec : {np.mean(prec)}")
    print(f"rec : {np.mean(rec)}")
    print(f"f1 : {np.mean(f1)}")

model = MultinomialNB()
cross_val(model)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


acc 0.678030303030303
prec 0.0
rec 0.0
f1 0.0
[[  0   0]
 [ 85 179]]
              precision    recall  f1-score   support

     Negatif       0.00      0.00      0.00         0
     Positif       1.00      0.68      0.81       264

    accuracy                           0.68       264
   macro avg       0.50      0.34      0.40       264
weighted avg       1.00      0.68      0.81       264

acc 0.6553030303030303
prec 0.0
rec 0.0
f1 0.0
[[  0   0]
 [ 91 173]]
              precision    recall  f1-score   support

     Negatif       0.00      0.00      0.00         0
     Positif       1.00      0.66      0.79       264

    accuracy                           0.66       264
   macro avg       0.50      0.33      0.40       264
weighted avg       1.00      0.66      0.79       264

acc 0.6273764258555133
prec 0.0
rec 0.0
f1 0.0
[[  0   0]
 [ 98 165]]
              precision    recall  f1-score   support

     Negatif       0.00      0.00      0.00         0
     Positif       1.00     

  _warn_prf(average, modifier, msg_start, len(result))


acc 0.596958174904943
prec 0.0
rec 0.0
f1 0.0
[[  0   0]
 [106 157]]
              precision    recall  f1-score   support

     Negatif       0.00      0.00      0.00         0
     Positif       1.00      0.60      0.75       263

    accuracy                           0.60       263
   macro avg       0.50      0.30      0.37       263
weighted avg       1.00      0.60      0.75       263

acc 0.8288973384030418
prec 0.14
rec 0.7777777777777778
f1 0.23728813559322035
[[  7   2]
 [ 43 211]]
              precision    recall  f1-score   support

     Negatif       0.14      0.78      0.24         9
     Positif       0.99      0.83      0.90       254

    accuracy                           0.83       263
   macro avg       0.57      0.80      0.57       263
weighted avg       0.96      0.83      0.88       263

acc 0.7224334600760456
prec 1.0
rec 0.7224334600760456
f1 0.8388520971302428
[[190  73]
 [  0   0]]
              precision    recall  f1-score   support

     Negatif       1