# IMDB 영화평 감상분석
- 파이프라인
- TfidfVectorizer + LogisticRegression

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Tab-separated file; quoting=3 (csv.QUOTE_NONE) keeps quote characters as literal text
df = pd.read_csv(
    'labeledTrainData.tsv',
    sep='\t',
    quoting=3,
)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


- 텍스트 전처리

In [3]:
# Replace the HTML line-break tag with a space. regex=False makes the literal
# match explicit, so the result does not depend on the pandas version's default.
df.review = df.review.str.replace('<br />', ' ', regex=False)

In [4]:
# Replace every non-letter character with a space, then trim both ends.
# regex=True is required: the pattern is a character class, and when the
# default is regex=False it would be matched literally and clean nothing.
df.review = df.review.str.replace('[^A-Za-z]', ' ', regex=True).str.strip()

  """Entry point for launching an IPython kernel.


- train/test split

In [5]:
from sklearn.model_selection import train_test_split

# Split reviews and labels, stratified on the label, with a fixed seed
# (no test_size is given, so the library default applies)
X_train, X_test, y_train, y_test = train_test_split(
    df.review,
    df.sentiment,
    stratify=df.sentiment,
    random_state=2022,
)
y_train.value_counts()

0    9375
1    9375
Name: sentiment, dtype: int64

### Pipeline 
- Pipeline에는 각 단계를 `(이름, 추정기)` 튜플의 리스트로 전달한다. (VotingClassifier의 `estimators` 인자와 유사한 형태)
- TfidfVectorizer + LogisticRegression

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [8]:
# Step 1: TF-IDF features over unigrams and bigrams, English stop words removed
tvec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
# Step 2: logistic regression classifier with a fixed seed
lrc = LogisticRegression(random_state=2022)
# Chain both steps; the step names are reused as prefixes in the parameter grid
pipline = Pipeline(steps=[
    ('TVECT', tvec),
    ('LR', lrc),
])

In [10]:
# Train: fit the whole pipeline (vectorizer + classifier) on the training split;
# %time reports how long the fit takes
%time pipline.fit(X_train, y_train)

CPU times: user 36 s, sys: 15.3 s, total: 51.3 s
Wall time: 34 s


Pipeline(steps=[('TVECT',
                 TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('LR', LogisticRegression(random_state=2022))])

In [11]:
# Score the fitted pipeline on the held-out test split
pipline.score(X_test, y_test)

0.87472

### 최적 파라미터 찾기 - GridSearchCV

In [12]:
from sklearn.model_selection import GridSearchCV
# Search grid for the pipeline. Keys follow '<step name>__<parameter>':
# 'TVECT' and 'LR' are the step names given to the Pipeline, so each list of
# candidate values is routed to that step's parameter.
params = {
    'TVECT__max_df': [100, 500],
    'LR__C' : [1, 10]
}
# NOTE(review): these max_df values are integers; per the scikit-learn docs an
# integer is presumably an absolute document count (a float would be a
# proportion) - confirm 100/500 are the intended cut-offs.
# Open question from the author: "Voting" keeps being mentioned - what is it?
#   (presumably scikit-learn's VotingClassifier, which also takes
#   (name, estimator) tuples - TODO confirm)
# Open question from the author: what is this key notation?
#   (it is the '<step name>__<parameter>' form described above)

In [13]:
# n_jobs = -1 : 컴퓨터의 모든 코어를 사용
grid_pipe = GridSearchCV(pipline, params, scoring = 'accuracy', cv = 3 , n_jobs = -1)
%time grid_pipe.fit(X_train, y_train)

CPU times: user 45.4 s, sys: 20.5 s, total: 1min 5s
Wall time: 3min 19s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('TVECT',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('LR',
                                        LogisticRegression(random_state=2022))]),
             n_jobs=-1,
             param_grid={'LR__C': [1, 10], 'TVECT__max_df': [100, 500]},
             scoring='accuracy')

In [14]:
# Parameter combination selected by the grid search
grid_pipe.best_params_

{'LR__C': 10, 'TVECT__max_df': 500}

In [15]:
# Score the best pipeline found by the search on the held-out test split
grid_pipe.best_estimator_.score(X_test, y_test)

0.87552

- 모델 저장하고 불러와서 예측하기

In [16]:
import joblib
# Persist the best pipeline (vectorizer and classifier together) to disk
joblib.dump(grid_pipe.best_estimator_, 'imdb_tvec_lr.pkl')

['imdb_tvec_lr.pkl']

In [17]:
# Reload the saved model. NOTE: despite the "tvec" in the name, the file was
# written from grid_pipe.best_estimator_, i.e. the full pipeline rather than
# only the vectorizer.
new_tvec = joblib.load('imdb_tvec_lr.pkl')

In [22]:
# Sample review text used to try out the reloaded model
review = '''I went to see this film with my anime and Diana Wynne Jones-loving teenage daughter. 
And while I enjoyed the film immensely due to its excellent animation, story and overall sense of fun, it was also interesting to see how much my daughter hated it! 
It was as if we saw two entirely different films. 
Why? Well, she is a huge Diana Wynne Jones fan and has read and re-read just about everything she ever wrote. 
And, according to her, the story was so different and so inferior to the book that she disliked the film and said some very nasty things about director Miyazaki. 
However, my advice is DON'T read the book and just go and see the movie. 
Then, after enjoying it, read the book ONLY if you are 100% sure you can do this without freaking out because they are so different!! 
I didn't know the difference and had a great time seeing the film!'''

In [23]:
import re
# Apply the same cleaning as the training data: replace HTML line breaks with a
# space first, then turn every non-letter into a space and trim both ends.
# Without the first step a review containing '<br />' would leave a stray 'br'
# token that the training text never had.
review = re.sub('[^A-Za-z]', ' ', review.replace('<br />', ' ')).strip()

In [26]:
# Show the cleaned text wrapped in a list, the form passed to predict below
[review]

['I went to see this film with my anime and Diana Wynne Jones loving teenage daughter   And while I enjoyed the film immensely due to its excellent animation  story and overall sense of fun  it was also interesting to see how much my daughter hated it   It was as if we saw two entirely different films   Why  Well  she is a huge Diana Wynne Jones fan and has read and re read just about everything she ever wrote   And  according to her  the story was so different and so inferior to the book that she disliked the film and said some very nasty things about director Miyazaki   However  my advice is DON T read the book and just go and see the movie   Then  after enjoying it  read the book ONLY if you are      sure you can do this without freaking out because they are so different    I didn t know the difference and had a great time seeing the film']

In [30]:
# Predict the sentiment label for the single cleaned review
new_tvec.predict([review])

array([0])