In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Location of the synthetic Yelp review dataset, relative to this notebook.
DATA_PATH = '../00_source_data/synthetic_data_yelp.csv'

# Load the reviews into a DataFrame, decoding the file as latin-1.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV was written with its index -- consider index_col=0 after confirming.
data = pd.read_csv(DATA_PATH, encoding='latin-1')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,label,text
0,1,kind husband tall chair would ok
1,1,even eat tast what white wine cocktail get bot...
2,1,pro friendli lot disgust woman entir time cold...
3,1,first experi ever companynnat first stop look ...
4,1,place went chicken awfulsnnnpleas pleas use fi...


In [4]:
def _print_metrics(title, y_true, y_pred):
    """Print accuracy, confusion matrix and classification report for one split.

    Parameters
    ----------
    title : str
        Heading printed before the metrics (e.g. 'Training set').
    y_true : array-like
        Ground-truth labels for the split.
    y_pred : array-like
        Labels predicted by the classifier, aligned with ``y_true``.
    """
    print(title)
    print('Accuracy: ', accuracy_score(y_true, y_pred))
    print('Confusion Matrix: ')
    print(confusion_matrix(y_true, y_pred))
    print('Classification Report: ')
    print(classification_report(y_true, y_pred))


# Drop rows with missing values. Rebinding (rather than inplace=True) avoids
# mutating the frame object that earlier cells loaded and displayed, while
# `data` still refers to the cleaned frame afterwards.
data = data.dropna()

# split data into train and test (80/20, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)

# CountVectorizer: fit the vocabulary on the training text only, then reuse it
# to transform the test text so no test information leaks into the features.
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

# training classifier with NB
nb = MultinomialNB()
nb.fit(X_train_cv, y_train)

# print prediction and metrics on training set
y_pred_tr = nb.predict(X_train_cv)
_print_metrics('Training set', y_train, y_pred_tr)
print('=====================================')

# print prediction and metrics on test set
y_pred_te = nb.predict(X_test_cv)
_print_metrics('Test set', y_test, y_pred_te)

Training set
Accuracy:  0.94275
Confusion Matrix: 
[[3816  172]
 [ 286 3726]]
Classification Report: 
              precision    recall  f1-score   support

           1       0.93      0.96      0.94      3988
           2       0.96      0.93      0.94      4012

    accuracy                           0.94      8000
   macro avg       0.94      0.94      0.94      8000
weighted avg       0.94      0.94      0.94      8000

Test set
Accuracy: 
0.9015
Confusion Matrix: 
[[939  73]
 [124 864]]
Classification Report: 
              precision    recall  f1-score   support

           1       0.88      0.93      0.91      1012
           2       0.92      0.87      0.90       988

    accuracy                           0.90      2000
   macro avg       0.90      0.90      0.90      2000
weighted avg       0.90      0.90      0.90      2000

