## Data Preparation

In [23]:
import pandas as pd
import numpy as np
import nltk
import pickle

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import naive_bayes

In [24]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akmal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
df = pd.read_csv('../dataset/reviews.csv', sep='\t', names=['Reviews', 'Comments'])
df.head()

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


## Create Model

In [26]:
stopset = stopwords.words('english')

In [27]:
# Proses transformasi data
vectorized = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)

In [28]:
# Split dataset ke dalam model training dan testing
X = vectorized.fit_transform(df['Comments'])
y = df['Reviews']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [29]:
# Naive bayes classifier
clf = naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)

In [30]:
# Informasi model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.95      0.97       738
           1       0.97      0.98      0.98       992

    accuracy                           0.97      1730
   macro avg       0.97      0.97      0.97      1730
weighted avg       0.97      0.97      0.97      1730



## Testing Model

In [31]:
review = 'awesome, cool, nice, awesome'
review_vector = vectorized.transform([review])
clf.predict(review_vector)

array([1], dtype=int64)

In [32]:
# Persentase kebagusan dari review
prob = clf.predict_proba(review_vector)
print("Persentase: ", prob[0][1])

Persentase:  0.8440326309391087


In [33]:
# Proses penyimpanan model
data = {
    'vectorized': vectorized,
    'clf': clf
}

pickle.dump(data, open('../model/sentiment.pkl', 'wb'))