 # 지도학습 기반 감성분석 - IMDB 영화평
 - 이진분류


In [1]:
import numpy as np
import pandas as pd

In [2]:
# First attempt: read the tab-separated file with pandas' default quote handling.
# The preview below shows reviews containing embedded double quotes coming out
# altered (see row 1), which motivates the re-read in the next cell.
df = pd.read_csv('labeledTrainData.tsv', sep = '\t')
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
import csv

# Re-read with quoting disabled (csv.QUOTE_NONE == 3): double quotes are kept as
# ordinary characters, so reviews with embedded quotes are no longer mis-parsed.
# The surrounding quotes remain in the `id` and `review` values (see preview below).
df = pd.read_csv('labeledTrainData.tsv', sep='\t', quoting=csv.QUOTE_NONE)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
# Inspect column dtypes and non-null counts to confirm there are no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [7]:
# Peek at the first 1000 characters of the review at index 0 (raw text, still with HTML tags).
df.review[0][:1000]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

## 텍스트 전처리

In [4]:
# Replace HTML line-break tags with a space using Series.str.replace.
# The pattern is a plain literal, so regex matching is switched off explicitly
# instead of relying on the pandas-version-dependent default.
df['review'] = df['review'].str.replace('<br />', ' ', regex=False)

- strip([chars]) : 인자로 전달된 문자를 문자열의 왼쪽과 오른쪽 끝에서 제거합니다. 인자를 생략하면(아래 코드의 `.str.strip()`처럼) 양쪽 끝의 공백을 제거합니다.

In [5]:
# Remove periods, digits and every other non-letter: each such character becomes a space,
# then leading/trailing whitespace is trimmed.
# Series.str.replace can apply a regular expression itself, so re.sub is not needed,
# but regex=True must be passed explicitly: pandas 2.0+ defaults to literal matching,
# in which case the character class below would never match and nothing would be cleaned.
df['review'] = df['review'].str.replace(r'[^A-Za-z]', ' ', regex=True).str.strip()

  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
# Re-inspect the same review after cleaning: only letters and spaces should remain.
df.review[0][:1000]

'With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for  

## Train / Test set split

In [6]:
from sklearn.model_selection import train_test_split

# Stratify on the label so the class ratio is preserved in both parts
# (the counts printed below are equal); the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    random_state=2022,
    stratify=df['sentiment'],
)
y_train.value_counts()

0    9375
1    9375
Name: sentiment, dtype: int64

## CountVectorizer로 변환
- 분리 후 변환임.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# Word-count vectorizer configured with the 'english' stop-word list.
cvec = CountVectorizer(stop_words = 'english')

In [8]:
# fit and transform are kept as two distinct steps: the vocabulary is learned
# from the training split, then that same split is encoded with it.
X_train_cv = cvec.fit(X_train).transform(X_train)
X_train_cv.shape

(18750, 65213)

In [9]:
# The test split is never fitted — it is only transformed with the vocabulary
# learned from the training split, so both matrices share the same columns.
X_test_cv = cvec.transform(X_test)
X_test_cv.shape

(6250, 65213)

### 분류기 적용 : Naive Bayes

In [13]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier created with its default arguments.
nb = MultinomialNB()

In [14]:
# The %time magic has to be placed in front of the statement being timed.
%time nb.fit(X_train_cv, y_train)

CPU times: user 38.7 ms, sys: 1.03 ms, total: 39.7 ms
Wall time: 60.2 ms


MultinomialNB()

In [16]:
# Score the unigram model on the held-out test split (result shown below).
nb.score(X_test_cv, y_test)

0.85504

## ngram_range = (1, 2)

In [17]:
# Unigram + bigram counts with the 'english' stop-word list.
cvec2 = CountVectorizer(ngram_range=(1, 2), stop_words='english')
# fit_transform learns the vocabulary and encodes the training split in a single
# pass; calling fit() and then transform() on the same data tokenizes it twice.
X_train_cv2 = cvec2.fit_transform(X_train)
print(X_train_cv2.shape)

(18750, 1386558)


In [18]:
# Encode the test split with the unigram+bigram vocabulary fitted on the training split.
X_test_cv2 = cvec2.transform(X_test)

### 분류기 적용

In [19]:
# Train a second Naive Bayes model on the unigram+bigram features and time the fit.
nb2 = MultinomialNB()
%time nb2.fit(X_train_cv2, y_train)

CPU times: user 188 ms, sys: 8.95 ms, total: 197 ms
Wall time: 193 ms


MultinomialNB()

In [20]:
# Score the unigram+bigram model on the held-out test split (result shown below).
nb2.score(X_test_cv2, y_test)

0.8672

## 모델을 저장하고 불러오기

In [21]:
import joblib

In [23]:
# When working locally, save under a path such as 'model/imdb..~' instead.
# Both the fitted vectorizer and the classifier are persisted: the classifier
# alone cannot turn raw text into features.
joblib.dump(cvec2, 'imdb_cvec12.pkl')
joblib.dump(nb2, 'imdb_nb2.pkl')

['imdb_nb2.pkl']

In [25]:
# Shell escape: list the working directory to confirm both .pkl files were written.
!ls -l

total 114376
-rw-r--r-- 1 root root 39184702 Apr 28 02:39 imdb_cvec12.pkl
-rw-r--r-- 1 root root 44370573 Apr 28 02:39 imdb_nb2.pkl
-rw-r--r-- 1 root root 33556378 Apr 28 01:33 labeledTrainData.tsv
drwxr-xr-x 1 root root     4096 Apr 25 13:46 sample_data


In [26]:
# Reload both artifacts from disk to verify the save/load round trip.
# NOTE(review): joblib files are pickle-based — only load artifacts from a trusted source.
new_cvec = joblib.load('imdb_cvec12.pkl')
new_nb = joblib.load('imdb_nb2.pkl')

## 실제 데이터로 검증

In [27]:
# A free-form movie review that is not part of the training file, used as a
# sanity check of the reloaded vectorizer and model.
review = '''I was extremely excited when I heard this movie was being remade.
Though I enjoyed the 80's version I thought with todays' movie magic it could be improved. 
But after seeing the trailers I had my doubts, which were well founded. 
I was glad I hadn't paid money to see this and waited for it to come to HBOmax. 
What Peter Jackson got right with the LOTR movie adaptations this movie gets all wrong. 
The movie is disjointed and confusing. The only reason I understood was because I read the books. 
I had to keep explaining the way things should have gone to my husband who was bored halfway through. 
To make a movie like this you have to be faithful and really understand the important things about the story, the 80's version at least understood that. 
I will probably do the same thing with part 2 wait till its on a streaming service. 
Too bad, really sad about this one.'''

In [28]:
# Text preprocessing — mirror the training pipeline step for step:
# 1) replace HTML line-break tags with a space (otherwise a stray "br" token would
#    survive the next step for reviews that contain them),
# 2) turn every non-letter into a space and trim the ends.
import re
review = review.replace('<br />', ' ')
review = re.sub(r'[^A-Za-z]', ' ', review).strip()

In [29]:
# Feature conversion — the vectorizer's transform must be given a list,
# so the single review string is wrapped in one.
review_cv = new_cvec.transform([review])

In [30]:
# Predict the sentiment label for the new review (result shown below).
# NOTE(review): 0 presumably denotes a negative review — confirm against the dataset description.
new_nb.predict(review_cv)

array([0])