# IMDB 영화평 감성 분석

In [22]:
import numpy as np
import pandas as pd

In [23]:
# Load the labelled IMDB reviews; the zipped file is tab-separated.
df = pd.read_csv(
    '../00.data/IMDB/labeledTrainData.tsv.zip',
    sep="\t",
)


In [24]:
# Display the loaded frame; the output below is abbreviated to the first and last rows.
df

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...


In [25]:
# Replace the HTML line-break tags embedded in the reviews with a single space.
# regex=False makes this a literal substring replacement, independent of the
# default value of `regex` in the installed pandas version.
df['review'] = df['review'].str.replace('<br />', ' ', regex=False)

In [26]:
import re

# Replace every character that is not an ASCII letter (digits, punctuation,
# whitespace, non-ASCII characters) with a space.
# The vectorised .str accessor replaces the per-row Python lambda and leaves
# missing values untouched instead of raising TypeError inside re.sub.
df['review'] = df['review'].str.replace(re.compile('[^a-zA-Z]'), ' ', regex=True)

In [27]:
from sklearn.model_selection import train_test_split

# Features are everything except the identifier and the label column.
feature_df = df.drop(columns=['id', 'sentiment'])

# 70/30 hold-out split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    df['sentiment'],
    test_size=0.3,
    random_state=156,
)
X_train.shape, X_test.shape

((17500, 1), (7500, 1))

In [28]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

- CountVectorizer

In [29]:
# Bag-of-words counts over unigrams and bigrams, with English stop words removed.
count_vect = CountVectorizer(stop_words='english', ngram_range=(1, 2))
# fit_transform learns the vocabulary and builds the training matrix in one
# pass, instead of analysing the training reviews once in fit() and again in
# transform().
X_train_count = count_vect.fit_transform(X_train.review)
# The test reviews are only transformed, so the vocabulary comes from the
# training split alone.
X_test_count = count_vect.transform(X_test.review)


In [30]:
# Logistic regression on the count features; fit() returns the estimator itself.
# NOTE(review): max_iter is left at its default - check whether the solver
# emits a ConvergenceWarning on this data.
lr_clf = LogisticRegression(C=10).fit(X_train_count, y_train)
pred = lr_clf.predict(X_test_count)
# Last expression of the cell, so the hold-out accuracy is shown as its output.
accuracy_score(y_true=y_test, y_pred=pred)


0.886

- TfidfVectorizer

In [31]:
# TF-IDF weights over unigrams and bigrams, with English stop words removed.
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
# fit_transform learns the vocabulary/IDF weights and builds the training
# matrix in one pass, instead of analysing the training reviews twice.
X_train_tfidf = tfidf_vect.fit_transform(X_train.review)
# The test reviews are only transformed, so the vocabulary and IDF weights
# come from the training split alone.
X_test_tfidf = tfidf_vect.transform(X_test.review)

In [32]:
# Same classifier settings as for the count features, now trained on the
# TF-IDF matrix. This rebinds `lr_clf` and `pred` from the earlier cell.
lr_clf = LogisticRegression(C=10).fit(X_train_tfidf, y_train)
pred = lr_clf.predict(X_test_tfidf)
# Last expression of the cell, so the hold-out accuracy is shown as its output.
accuracy_score(y_true=y_test, y_pred=pred)

0.8936

### 모델 저장하고 불러오기

In [33]:
import os

import joblib

# joblib.dump opens the target path for writing and does not create missing
# parent directories, so make sure the output folder exists first.
os.makedirs('model', exist_ok=True)

# Persist the fitted TF-IDF vectorizer and the classifier trained on its features.
joblib.dump(tfidf_vect, 'model/imdb_vect.pkl')
joblib.dump(lr_clf, 'model/imdb_lr.pkl')

['model/imdb_lr.pkl']

In [34]:
# Remove the in-memory vectorizer and classifier; the following cells work
# with the copies loaded back from disk.
del tfidf_vect, lr_clf

In [35]:
# Load the persisted vectorizer and classifier back under new names.
# NOTE: joblib files are pickle-based - only load files from a trusted source.
new_vect = joblib.load(filename='model/imdb_vect.pkl')
new_lr = joblib.load(filename='model/imdb_lr.pkl')

In [36]:
# Vectorise both splits with the reloaded vectorizer (transform only, no refit).
new_X_train = new_vect.transform(X_train['review'])
new_X_test = new_vect.transform(X_test['review'])

In [37]:
# Score the reloaded classifier on the held-out reviews.
pred = new_lr.predict(new_X_test)
# Last expression of the cell, so the accuracy is shown as its output.
accuracy_score(y_true=y_test, y_pred=pred)

0.8936

### Pipeline을 써서 학습/평가/예측

In [39]:
from sklearn.pipeline import Pipeline

# Chain vectorisation and classification into one estimator so that raw
# review text can be passed straight to fit() and predict().
pipeline = Pipeline(steps=[
    ('count_vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression(C=10)),
])

# Train on the raw training reviews, then evaluate on the held-out reviews.
pipeline.fit(X_train['review'], y_train)
pred = pipeline.predict(X_test['review'])
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'Count Vectorizer + LogisticRegression 정확도 : {acc}')

Count Vectorizer + LogisticRegression 정확도 : 0.886


In [41]:
# Persist the whole fitted pipeline (vectorizer + classifier) as a single file.
joblib.dump(value=pipeline, filename='model/pipeline.pkl')

['model/pipeline.pkl']

In [42]:
# Load the persisted pipeline back from disk.
# NOTE: joblib files are pickle-based - only load files from a trusted source.
new_pipe = joblib.load(filename='model/pipeline.pkl')

In [43]:
# Predict on the held-out reviews with the reloaded pipeline and score the result.
pred = new_pipe.predict(X_test['review'])
acc = accuracy_score(y_true=y_test, y_pred=pred)

In [44]:
# Report the accuracy computed in the previous cell from the reloaded pipeline's predictions.
print(f'Count Vectorizer + LogisticRegression 정확도 : {acc}')

Count Vectorizer + LogisticRegression 정확도 : 0.886
