Data source: http://ai.stanford.edu/~amaas/data/sentiment/

## Make dataframe

In [None]:
import os
import warnings

import numpy as np
import pandas as pd
import pyprind
warnings.filterwarnings("ignore")

In [None]:
# Root directory of the extracted dataset; the loader below reads
# <basepath>/{test,train}/{pos,neg}/ underneath it.
basepath = "aclImdb"

In [None]:
# Map the sub-directory name of each review to its integer sentiment label.
labels = {"pos": 1, "neg": 0}
# Progress bar sized for 50,000 iterations (presumably one per review file:
# the frame loaded later in this notebook has 50,000 rows).
pbar = pyprind.ProgBar(50000)

In [None]:
# Read every review file and collect one [text, label] record per file.
# The frame is built once at the end: growing a DataFrame row by row with
# DataFrame.append is quadratic, and the method was removed in pandas 2.0.
records = []
for split_name in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        path = os.path.join(basepath, split_name, label_name)
        # Sorted so the row order is deterministic across platforms.
        for file_name in sorted(os.listdir(path)):
            with open(os.path.join(path, file_name),
                      'r', encoding='utf-8') as infile:
                records.append([infile.read(), labels[label_name]])
            pbar.update()

df = pd.DataFrame(records, columns=['review', 'sentiment'])

In [None]:
# Shuffle the rows with a fixed seed so the permutation is reproducible.
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
# NOTE(review): the export below is commented out, but the next cell reads
# 'movie_data.csv'; uncomment it when that file has to be (re)created.
# df.to_csv("movie_data.csv", index=False, encoding='utf-8')

In [1]:
# Load the review/sentiment table from the CSV file, then show its
# dimensions and the last few rows as a quick sanity check.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
print(df.shape)
df.tail()

(50000, 2)


Unnamed: 0,review,sentiment
49995,"Towards the end of the movie, I felt it was to...",0
49996,This is the kind of movie that my enemies cont...,0
49997,I saw 'Descent' last night at the Stockholm Fi...,0
49998,Some films that you pick up for a pound turn o...,0
49999,"This is one of the dumbest films, I've ever se...",0


---

## Preprocessing

- Python 정규표현식을 사용하여 HTML markup 문자 제거
- Sentiment Analysis에 도움이 될 이모티콘은 문장 마지막에 따로 추가
- 문자를 모두 소문자로 바꾸고 특수문자들 제거

In [2]:
# Show the last 50 characters of one raw review (row label 41556) to
# illustrate the HTML markup that the preprocessing step has to remove.
df.loc[41556, 'review'][-50:]

"an this, and that wasn't even so good.<br /><br />"

In [3]:
import re

In [4]:
def preprocessor(text):
    """Clean one raw review for bag-of-words processing.

    Strips HTML tags, lower-cases the text, collapses every run of non-word
    characters into a single space, and appends the emoticons found in the
    original text to the end (with the '-' nose removed, so ":-)" becomes ":)").

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned text followed by the space-separated emoticons.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Emoticons must be captured before the \W substitution below wipes them out.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    # Drop the '-' from emoticons such as ":-)".
    suffix = ' '.join(emoticons).replace('-', '')
    # Keep the first emoticon from being glued to the last word when the
    # cleaned text does not already end in a space (e.g. "great :) movie").
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix

In [5]:
# Sanity check: the markup is stripped, the text is lower-cased and the
# emoticons are moved to the end of the string.
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [6]:
# Clean every review; the 'review' column is overwritten with the
# preprocessed text, which is what the cells below operate on.
df['review'] = df['review'].apply(preprocessor)

## EDA
- EDA 과정의 일환으로 Scikit-learn에 구현된 `LatentDirichletAllocation` 클래스를 사용하여 영화 리뷰를 여러 개의 topic으로 분류하는 작업을 해보도록 하겠습니다.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

단어의 최대 문서 빈도(`max_df`)를 10%로 설정해서 너무 자주 등장하는 단어는 제외했습니다. 모든 문서에 걸쳐 자주 등장하는 단어는 topic 카테고리와 관련성이 적을 수 있다고 생각했습니다. 그리고 데이터셋의 차원을 조절하기 위해서 가장 자주 등장하는 4,000개로 단어의 수를 제한했습니다(`max_features=4000`).

In [36]:
# English stop-word list from NLTK; also reused by the grid search further down.
stop = stopwords.words('english')
# Bag-of-words model: ignore stop words, drop terms whose document frequency
# exceeds 10% (max_df=0.1) and cap the vocabulary at 4,000 terms.
count = CountVectorizer(stop_words=stop,
                        max_df=0.1,
                        max_features=4000)

# Document-term count matrix, one row per review in the frame's current row order.
X = count.fit_transform(df['review'].values)

In [37]:
from sklearn.decomposition import LatentDirichletAllocation

In [38]:
%%time
# Fit a 10-topic LDA model on the count matrix with a fixed random_state.
lda = LatentDirichletAllocation(n_components=10,
                                random_state=41,
                                learning_method='batch')
# `topics` holds one row per review and one column per topic.
topics = lda.fit_transform(X)

CPU times: user 4min 39s, sys: 301 ms, total: 4min 39s
Wall time: 4min 39s


In [39]:
# Expect (number of reviews, number of topics).
topics.shape

(50000, 10)

In [40]:
# Number of highest-weighted terms to print for each topic.
n_top_words = 6
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(count, 'get_feature_names_out'):
    feature_names = count.get_feature_names_out()
else:
    feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("토픽 {}".format(topic_idx + 1))
    # argsort is ascending, so walk it backwards to get the largest weights first.
    top_term_ids = topic.argsort()[:-n_top_words - 1:-1]
    print(" ".join(feature_names[i] for i in top_term_ids))

토픽 1
action fight shot car guy guys
토픽 2
game video horror zombie dead house
토픽 3
john role plays played james performance
토픽 4
horror killer blood gore ending girl
토픽 5
series war book episode tv read
토픽 6
music comedy fun wonderful song songs
토픽 7
family father mother woman wife son
토픽 8
worst script original budget effects awful
토픽 9
cinema art audience human beautiful style
토픽 10
guy comedy kids stupid laugh jokes


In [41]:
# Column 5 of `topics` is the topic printed as "토픽 6" by the cell above
# (that printout is 1-based). argsort is ascending, so reverse it to rank
# the reviews by descending weight for this topic.
movie = topics[:, 5].argsort()[::-1]
for rank, review_pos in enumerate(movie[:10], start=1):
    print("\n영화 #{}:".format(rank))
    # argsort yields row positions of `topics`, so look the review up by
    # position (.iloc) instead of by index label; both agree only while the
    # frame has a default 0..n-1 index.
    print(df['review'].iloc[review_pos][:500], '...')


영화 #1:
had i been familiar with the stage production of guys and dolls before seeing the movie i might not be as fond of it as i am although in all fairness i would probably still like the film production better because of my general adoration of both brando for his acting and sinatra for his voice although he is quite the actor as well see the manchurian candidate or from here to eternity as for some of the other reviewers statements about the songs i have the broadway soundtrack and though isabel bin ...

영화 #2:
a give this movie a 10 10 because it deserves a 10 10 two of the best actors of their time walter matthau george burns collaborate with neil simon and all of the other actors that are in this film director herbert ross and all of that makes this stage adaption come true the sunshine boys is one of the best films of the 70 s i love the type of humor in this film it just makes me laugh so hard i got this movie on vhs 3 days ago yes vhs because it was cheaper only 3 i watched i

---

## Model training

In [None]:
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the reviews for testing. random_state is fixed so the split
# (and every score derived from it) is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(df.review, df.sentiment,
                                                    test_size=0.33,
                                                    random_state=0)

In [None]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens
# Shared stemmer instance used by tokenizer_porter.
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of every token."""
    return list(map(porter.stem, text.split()))
# lowercase=False: preprocessor() already lower-cases the review text, so the
# vectorizer does not need to do it again.
tfidf = TfidfVectorizer(lowercase=False)

In [None]:
# Hyper-parameters shared by both grid variants.
shared_grid = {
    'vect__ngram_range': [(1, 2)],
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer, tokenizer_porter],
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [1.0, 10.0, 100.0],
}
# NOTE: the misspelt name `param_gird` is kept on purpose because the
# GridSearchCV cell below references it.
param_gird = [
    # Variant 1: the vectorizer keeps its default idf weighting and norm.
    dict(shared_grid),
    # Variant 2: idf weighting and normalisation are both switched off.
    dict(shared_grid, vect__use_idf=[False], vect__norm=[None]),
]

# Chain the vectorizer and the classifier so the grid search can tune both
# through the 'vect__' and 'clf__' parameter prefixes.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(solver='liblinear', random_state=0))])

In [None]:
# Exhaustive search over param_gird, scored by accuracy with 5-fold
# cross-validation; verbose=1 prints progress while fitting.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_gird,
                           scoring='accuracy',
                           cv=5, verbose=1)

In [None]:
%%time
# Run the grid search on the training split only; the test split stays untouched.
gs_lr_tfidf.fit(X_train, y_train)

In [None]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; use the standalone joblib package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Persist the fitted grid search. The grid contains the custom tokenizer
# functions, which pickle stores by reference, so they must be defined again
# before the file is loaded with joblib.load.
joblib.dump(gs_lr_tfidf, 'gs_lr_tfidf.pkl')