# IMDB 영화평 분류 모델 생성

In [6]:
import numpy as np 
import pandas as pd 

### 데이터 전처리

In [7]:
df = pd.read_csv('../static/data/IMDB/labeledTrainData.tsv',
                    header=0, sep='\t', quoting=3)
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


- null값이나 공백 값을 가지고 있는지 확인

In [8]:
df.isna().sum()

id           0
sentiment    0
review       0
dtype: int64

In [9]:
df[df.review == ''].count()

id           0
sentiment    0
review       0
dtype: int64

In [10]:
df.review[0][:1000]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [11]:
# <br /> 태그는 공백으로 변환
df['review'] = df.review.str.replace('<br />', ' ')

In [12]:
# 영어 이외의 문자는 공백으로 변환
import re

df['review'] = df.review.apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))

### train / test 데이터셋 분리

In [13]:
from sklearn.model_selection import train_test_split

feature_df = df.drop(['id', 'sentiment'], axis=1, inplace=False)

X_train, X_test, y_train, y_test = train_test_split(
    feature_df, df.sentiment, test_size=0.25, random_state=2021
)
X_train.shape, X_test.shape

((18750, 1), (6250, 1))

In [14]:
X_train[:3]

Unnamed: 0,review
14475,There s no shortage of bad dialogue in David ...
22605,This film takes what could have been a good i...
17673,Bob Clampett s Porky s Poor Fish is a so so...


In [15]:
X_test[:3]

Unnamed: 0,review
13895,I was expecting a B Movie French musical Aft...
20903,Disappearance is about a couple who take thei...
8539,I noticed at once that this movie really wasn...


In [16]:
df_test = pd.DataFrame(X_test, columns=['review'])
df_test['sentiment'] = y_test

In [17]:
df_test.to_csv('../static/data/IMDB_test.csv', index=False)

In [18]:
df_test = pd.read_csv('../static/data/IMDB_test.csv')
df_test.head()

Unnamed: 0,review,sentiment
0,I was expecting a B Movie French musical Aft...,0
1,Disappearance is about a couple who take thei...,0
2,I noticed at once that this movie really wasn...,1
3,Friday the th step over There is officiall...,0
4,A delightful piece of cinema storytelling in ...,1


In [69]:
df_test.shape

(6250, 2)

### 학습 / 예측 / 평가

In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import joblib

#### 1) CountVectorizer + LogisticRegression

In [18]:
pipeline = Pipeline([
    ('count_vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression())
])
params = ({
    'count_vect__max_df': [100, 300, 500],
    'lr_clf__C': [1, 5, 10]
})

In [19]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(f'최고 평균 정확도: {grid_pipe.best_score_:.4f}')
print('최적 파라미터:', grid_pipe.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  3.7min finished
최고 평균 정확도: 0.8667
최적 파라미터: {'count_vect__max_df': 500, 'lr_clf__C': 1}


In [20]:
best_count_lr = grid_pipe.best_estimator_
pred_count_lr = best_count_lr.predict(df_test.review.values)
acc = accuracy_score(df_test.sentiment.values, pred_count_lr)
print(f'CountVectorizer + LogisticRegression 정확도: {acc:.4f}')

CountVectorizer + LogisticRegression 정확도: 0.8734


In [21]:
joblib.dump(best_count_lr, '../static/model/imdb_count_lr.pkl')

['../static/model/imdb_count_lr.pkl']

#### 2) TfidfVectorizer + LogisticRegression

In [22]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression())
])
params = ({
    'tfidf_vect__max_df': [100, 300, 500],
    'lr_clf__C': [1, 5, 10]
})

In [23]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(f'최고 평균 정확도: {grid_pipe.best_score_:.4f}')
print('최적 파라미터:', grid_pipe.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  2.4min finished
최고 평균 정확도: 0.8777
최적 파라미터: {'lr_clf__C': 10, 'tfidf_vect__max_df': 500}


In [24]:
best_tfidf_lr = grid_pipe.best_estimator_
pred_tfidf_lr = best_tfidf_lr.predict(df_test.review.values)
acc = accuracy_score(df_test.sentiment.values, pred_tfidf_lr)
print(f'Tfidf Vectorizer + Logistic Regression 정확도: {acc:.4f}')

Tfidf Vectorizer + Logistic Regression 정확도: 0.8814


In [25]:
joblib.dump(best_tfidf_lr, '../static/model/imdb_tfidf_lr.pkl')

['../static/model/imdb_tfidf_lr.pkl']

#### 3) TfidfVectorizer + SVM

In [20]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('sv_clf', SVC())
])

In [21]:
%time pipeline.fit(X_train.review, y_train)

Wall time: 17min 49s


Pipeline(steps=[('tfidf_vect',
                 TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('sv_clf', SVC())])

In [22]:
%time pred_tfidf_sv = pipeline.predict(df_test.review.values)
acc = accuracy_score(y_test, pred_tfidf_sv)
print(f'Tfidf Vectorizer + SVM 정확도: {acc:.4f}')

Wall time: 2min 7s
Tfidf Vectorizer + SVM 정확도: 0.8832


In [23]:
joblib.dump(pipeline, '../static/model/imdb_tfidf_sv.pkl')

['../static/model/imdb_tfidf_sv.pkl']

### 테스트

In [28]:
model_cl = joblib.load('../static/model/imdb_count_lr.pkl')
model_tl = joblib.load('../static/model/imdb_tfidf_lr.pkl')
model_ts = joblib.load('../static/model/imdb_tfidf_sv.pkl')

- 인덱스를 입력받을 때

In [58]:
index = 1000

In [59]:
test_data = []
test_data.append(df_test.iloc[index, 0])

In [60]:
test_data

[' i say the domino principle is an enormously underappreciated film anyone who has taken the time to investigate our contemporary history of conspiracies jfk  rfk  mlk g wallace and in fact numerous others can only draw the conclusion that the author of the domino principle really knew what he was talking about roy tucker could be lee harvey oswald or james earl ray or sirhan sirhan or arthur bremer maybe even john hinkley or timothy mcveigh to mention a few the conspiracy scenario involving spies  big business and political assassinations is not really a fiction but an ominous part of our convoluted existential history god help us but the domino principle is more fact than fantasy if this causes a little loss of sleep  maybe it should don t take my word for it investigate for yourselves  ']

In [61]:
label = df_test.sentiment[index]
label

1

In [62]:
pred_cl = model_cl.predict(test_data)
pred_tl = model_tl.predict(test_data)
pred_ts = model_ts.predict(test_data)

In [63]:
label, pred_cl[0], pred_tl[0], pred_ts[0]

(1, 1, 1, 1)

- 리뷰를 직접 입력받을 때

In [64]:
review_string = '''After reading all the rave reviews for this movie, it turned out to be a real disappointment. Can anyone really believe that the prisoners are honourable men while the guards and warden are all "crooks"?? Haven't we overdone the hypocritical "Bible thumper"?? -- let's find a more enlightened characterization. James Bond, himself, could not have followed the main character's escape plan. The ending soliloquy and denouement were inferior plagiarisms from "Cool Hand Luke". Enough!!! Unfortunately, fine performances by both Robbins and Freeman were wasted in this "fantasy" story.'''

In [65]:
# 리뷰를 직접 입력받을 때
test_data = []
test_data.append(review_string)

In [66]:
test_data

['After reading all the rave reviews for this movie, it turned out to be a real disappointment. Can anyone really believe that the prisoners are honourable men while the guards and warden are all "crooks"?? Haven\'t we overdone the hypocritical "Bible thumper"?? -- let\'s find a more enlightened characterization. James Bond, himself, could not have followed the main character\'s escape plan. The ending soliloquy and denouement were inferior plagiarisms from "Cool Hand Luke". Enough!!! Unfortunately, fine performances by both Robbins and Freeman were wasted in this "fantasy" story.']

In [67]:
pred_cl = model_cl.predict(test_data)
pred_tl = model_tl.predict(test_data)
pred_ts = model_ts.predict(test_data)

In [68]:
# 직접 입력
pred_cl[0], pred_tl[0], pred_ts[0]

(0, 0, 0)