# 모듈 및 데이터 로딩

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load yelp.csv into a DataFrame; the first CSV column becomes the row index.
data = pd.read_csv('yelp.csv', index_col=0)

# 독립변수(X)와 종속변수(y) 분리

In [5]:
# Independent variable X is the 'text' column; dependent variable y is the 'stars' column.
X, y = data['text'], data['stars']

# Count Vectorizer로 데이터 변환

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# CountVectorizer with all-default settings; it will turn raw text into token-count features.
cv = CountVectorizer()

In [8]:
# Learn the vocabulary from the text in X.
# NOTE(review): X is still the full 'text' column at this point (train_test_split runs
# later), so the vocabulary is learned from rows that end up in the test set. To avoid
# this leakage, split the raw text first and fit the vectorizer on the training part only.
cv.fit(X)

In [9]:
# Replace the raw text in X with its token-count matrix from the fitted vectorizer.
# X is rebound here, so from this cell on it no longer holds the original documents;
# reload X from `data` before re-running the fit/transform cells.
X = cv.transform(raw_documents=X)

# Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

# Naive Bayes 알고리즘으로 모델링

In [12]:
from sklearn.naive_bayes import MultinomialNB

In [13]:
# Multinomial Naive Bayes classifier with default hyperparameters.
model = MultinomialNB()

In [14]:
# Train the Naive Bayes model on the training split.
model.fit(X=X_train, y=y_train)

In [15]:
# Predicted labels for the held-out test rows (Naive Bayes).
pred = model.predict(X=X_test)

# 예측 결과 평가하기 (Naive Bayes)

In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [17]:
# Fraction of test rows whose Naive Bayes prediction equals the true label.
accuracy_score(y_true=y_test, y_pred=pred)

0.9265

In [18]:
# Confusion matrix for Naive Bayes: rows are true labels, columns are predicted labels,
# both in sorted label order.
confusion_matrix(y_true=y_test, y_pred=pred)

array([[ 421,   65],
       [  82, 1432]], dtype=int64)

In [19]:
# Per-class precision / recall / F1 summary for the Naive Bayes predictions.
nb_report = classification_report(y_test, pred)
print(nb_report)

              precision    recall  f1-score   support

           1       0.84      0.87      0.85       486
           5       0.96      0.95      0.95      1514

    accuracy                           0.93      2000
   macro avg       0.90      0.91      0.90      2000
weighted avg       0.93      0.93      0.93      2000



# Random Forest로 모델링 (비교 목적)

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
# Random forest used as a comparison model: 1000 trees, each limited to depth 10.
# random_state pins the forest's internal randomness so reruns produce the same model
# (same seed value as the train_test_split call); without it every run gives
# different metrics. n_jobs=-1 builds the trees on all available cores and does not
# change the fitted result.
rf = RandomForestClassifier(
    max_depth=10,
    n_estimators=1000,
    random_state=100,
    n_jobs=-1,
)

In [22]:
# Train the random forest on the same training split used for Naive Bayes.
rf.fit(X=X_train, y=y_train)

In [23]:
# Predicted labels for the held-out test rows (random forest).
pred2 = rf.predict(X=X_test)

# 예측 결과 평가하기 (Random Forest)

In [24]:
# Fraction of test rows whose random forest prediction equals the true label.
accuracy_score(y_true=y_test, y_pred=pred2)

0.7875

In [25]:
# Confusion matrix for the random forest: rows are true labels, columns are predicted
# labels, both in sorted label order.
confusion_matrix(y_true=y_test, y_pred=pred2)

array([[  65,  421],
       [   4, 1510]], dtype=int64)

In [26]:
# Per-class precision / recall / F1 summary for the random forest predictions.
rf_report = classification_report(y_test, pred2)
print(rf_report)

              precision    recall  f1-score   support

           1       0.94      0.13      0.23       486
           5       0.78      1.00      0.88      1514

    accuracy                           0.79      2000
   macro avg       0.86      0.57      0.56      2000
weighted avg       0.82      0.79      0.72      2000

