# 모듈 및 데이터 로딩

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read yelp.csv into a DataFrame; index_col=0 uses the first CSV column as the row index.
data = pd.read_csv(filepath_or_buffer='yelp.csv', index_col=0)

# 독립변수(X)와 종속변수(y) 분리

In [3]:
# Features come from the 'text' column; the target comes from the 'stars' column.
X, y = data['text'], data['stars']

# Count Vectorizer로 데이터 변환

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Token-count vectorizer created with scikit-learn's default settings.
cv = CountVectorizer()

In [6]:
# Learn the vocabulary from every text in X.
# NOTE(review): this runs before train_test_split, so words that occur only in
# rows later assigned to the test set still become feature columns. Fit on the
# training texts alone if a strict hold-out evaluation is required.
cv.fit(X)

In [7]:
# X = cv.fit_transform(X)  # fit and transform in a single line
# X is rebound from the raw text to its vectorized form, so re-running this
# cell (or the fit cell before it) would pass the matrix instead of text.
X = cv.transform(X)

In [30]:
# Print the stored entries of X as "(row, column)  count" lines.
print(X)

  (0, 1423)	1
  (0, 1453)	1
  (0, 1580)	1
  (0, 1905)	2
  (0, 2027)	2
  (0, 3066)	1
  (0, 3694)	1
  (0, 4157)	1
  (0, 4938)	1
  (0, 4975)	1
  (0, 5112)	1
  (0, 6921)	1
  (0, 7535)	1
  (0, 9746)	2
  (0, 10103)	2
  (0, 10370)	1
  (0, 10440)	1
  (0, 11315)	1
  (0, 12016)	1
  (0, 12805)	1
  (0, 13373)	1
  (0, 15023)	1
  (0, 15210)	1
  (0, 15742)	1
  (0, 15791)	1
  :	:
  (9998, 27679)	3
  (9998, 27786)	1
  (9998, 27818)	1
  (9998, 27974)	2
  (9998, 28038)	3
  (9999, 1580)	1
  (9999, 2380)	3
  (9999, 2611)	1
  (9999, 2857)	1
  (9999, 8012)	1
  (9999, 9067)	1
  (9999, 10081)	1
  (9999, 10103)	1
  (9999, 10417)	1
  (9999, 11067)	1
  (9999, 14879)	1
  (9999, 16451)	1
  (9999, 16901)	1
  (9999, 21795)	1
  (9999, 22108)	1
  (9999, 22260)	1
  (9999, 25145)	1
  (9999, 26879)	1
  (9999, 27679)	1
  (9999, 28048)	1


In [31]:
# Look up the vocabulary term that corresponds to feature column 800.
cv.get_feature_names_out()[800]

'abuses'

# Train Test Split

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

# Naive Bayes 알고리즘으로 모델링

In [34]:
from sklearn.naive_bayes import MultinomialNB

In [35]:
# Multinomial Naive Bayes classifier with default hyperparameters.
model = MultinomialNB()

In [36]:
# Train the classifier on the training split.
model.fit(X_train, y_train)

In [37]:
# Predicted labels for the held-out test rows.
pred = model.predict(X_test)

# 예측 결과 평가하기 (Naive Bayes)

In [38]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [39]:
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_test, pred)

0.9265

In [40]:
# Confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(y_test, pred)

array([[ 421,   65],
       [  82, 1432]], dtype=int64)

In [41]:
# Per-class precision, recall, F1 and support for the Naive Bayes predictions.
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           1       0.84      0.87      0.85       486
           5       0.96      0.95      0.95      1514

    accuracy                           0.93      2000
   macro avg       0.90      0.91      0.90      2000
weighted avg       0.93      0.93      0.93      2000



# Random Forest로 모델링 (비교 목적)

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
# Random forest used for comparison: 1000 trees, each limited to depth 10.
# random_state pins the forest's internal randomness so its scores can be
# reproduced; 100 matches the seed already used for the train/test split.
rf = RandomForestClassifier(max_depth=10, n_estimators=1000, random_state=100)

In [22]:
# Train the random forest on the same training split used for Naive Bayes.
rf.fit(X_train, y_train)

In [23]:
# Random forest predictions for the held-out test rows.
pred2 = rf.predict(X_test)

# 예측 결과 평가하기 (Random Forest)

In [24]:
# Fraction of test rows the random forest labels correctly.
accuracy_score(y_test, pred2)

0.7875

In [25]:
# Confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(y_test, pred2)

array([[  65,  421],
       [   4, 1510]], dtype=int64)

In [26]:
# Per-class precision, recall, F1 and support for the random forest predictions.
print(classification_report(y_test, pred2))

              precision    recall  f1-score   support

           1       0.94      0.13      0.23       486
           5       0.78      1.00      0.88      1514

    accuracy                           0.79      2000
   macro avg       0.86      0.57      0.56      2000
weighted avg       0.82      0.79      0.72      2000

