In [74]:
import gensim
from gensim.models import Doc2Vec

import numpy as np
from random import shuffle
from sklearn.linear_model import LogisticRegression

In [13]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

import multiprocessing
import os

## 실습 4-3-1. 생성한 Doc2Vec 모델을 활용하여, 2000개의 검증 데이터에 대한 감성분석을 수행한다
- 로지스틱 회귀 분류기(logistic regression classifier)를 활용하여 각 리뷰를 긍정/부정으로 분류한다
- 기계학습 알고리즘의 변수(feature)로는 Doc2Vec의 결과로 생성된 3000차원의 배열(array)을 활용한다

In [4]:
# English stop-word set used for membership tests while filtering review tokens.
stop_words = set(stopwords.words('english'))
# WordNet lemmatizer; tokens are lemmatized before being checked against stop_words.
lemm = WordNetLemmatizer()

In [7]:
# Short alias for gensim's labelled-document container, used by LabeledLineSentence below.
# NOTE(review): newer gensim releases expose this class as TaggedDocument —
# confirm against the installed gensim version.
LabeledSentence = gensim.models.doc2vec.LabeledSentence

class LabeledLineSentence(object):
    """Iterable corpus that pairs every document with exactly one tag.

    Each call to ``__iter__`` starts a fresh pass over ``doc_list`` and
    yields one ``LabeledSentence`` per entry. The words are obtained by
    converting the entry with ``str()`` and splitting on whitespace; the
    single tag is the element of ``labels_list`` at the same position.
    """

    def __init__(self, doc_list, labels_list):
        """Store the documents and their position-aligned tags."""
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        """Yield one ``LabeledSentence`` per document, in ``doc_list`` order."""
        position = 0
        for document in self.doc_list:
            tokens = str(document).split()
            tag = self.labels_list[position]
            yield LabeledSentence(words=tokens, tags=[tag])
            position += 1

In [37]:
# Parallel accumulators filled by the loading cells below:
# review_list[k] is the filtered text of a review, labels_list[k] its document tag.
review_list = []
labels_list = []

In [38]:
# Load the first 1000 entries listed under aclImdb/train/pos, drop every token
# whose lemma is in stop_words, and append the filtered text and its tag to
# review_list / labels_list.
# NOTE(review): the tag is just 'pos_' + file name with no train/test marker,
# and the test/pos cell builds its tags the same way, so a file name present in
# both folders ends up sharing one tag — confirm, and disambiguate together
# with the cells that look the vectors up.
files = os.listdir('aclImdb/train/pos')[:1000]
for file in files:
    # The with-block closes the file on exit; no explicit close() is needed.
    with open('aclImdb/train/pos/{}'.format(file), 'r', encoding = 'utf-8') as f:
        tokens = word_tokenize(f.read())
    # Single join instead of repeated string concatenation; every kept token is
    # still prefixed with one space, so the resulting text is unchanged.
    review = ''.join(' ' + word for word in tokens
                     if lemm.lemmatize(word) not in stop_words)
    review_list.append(review)
    labels_list.append('pos_' + file)

In [39]:
# Load the first 1000 entries listed under aclImdb/train/neg, drop every token
# whose lemma is in stop_words, and append the filtered text and its tag to
# review_list / labels_list.
# NOTE(review): the tag is just 'neg_' + file name with no train/test marker,
# and the test/neg cell builds its tags the same way, so a file name present in
# both folders ends up sharing one tag — confirm, and disambiguate together
# with the cells that look the vectors up.
files = os.listdir('aclImdb/train/neg')[:1000]
for file in files:
    # The with-block closes the file on exit; no explicit close() is needed.
    with open('aclImdb/train/neg/{}'.format(file), 'r', encoding = 'utf-8') as f:
        tokens = word_tokenize(f.read())
    # Single join instead of repeated string concatenation; every kept token is
    # still prefixed with one space, so the resulting text is unchanged.
    review = ''.join(' ' + word for word in tokens
                     if lemm.lemmatize(word) not in stop_words)
    review_list.append(review)
    labels_list.append('neg_' + file)

In [40]:
# Load the first 1000 entries listed under aclImdb/test/pos, drop every token
# whose lemma is in stop_words, and append the filtered text and its tag to
# review_list / labels_list.
# NOTE(review): the tag is just 'pos_' + file name with no train/test marker,
# and the train/pos cell builds its tags the same way, so a file name present
# in both folders ends up sharing one tag — confirm, and disambiguate together
# with the cells that look the vectors up.
files = os.listdir('aclImdb/test/pos')[:1000]
for file in files:
    # The with-block closes the file on exit; no explicit close() is needed.
    with open('aclImdb/test/pos/{}'.format(file), 'r', encoding = 'utf-8') as f:
        tokens = word_tokenize(f.read())
    # Single join instead of repeated string concatenation; every kept token is
    # still prefixed with one space, so the resulting text is unchanged.
    review = ''.join(' ' + word for word in tokens
                     if lemm.lemmatize(word) not in stop_words)
    review_list.append(review)
    labels_list.append('pos_' + file)

In [42]:
# Load the first 1000 entries listed under aclImdb/test/neg, drop every token
# whose lemma is in stop_words, and append the filtered text and its tag to
# review_list / labels_list.
# NOTE(review): the tag is just 'neg_' + file name with no train/test marker,
# and the train/neg cell builds its tags the same way, so a file name present
# in both folders ends up sharing one tag — confirm, and disambiguate together
# with the cells that look the vectors up.
files = os.listdir('aclImdb/test/neg')[:1000]
for file in files:
    # The with-block closes the file on exit; no explicit close() is needed.
    with open('aclImdb/test/neg/{}'.format(file), 'r', encoding = 'utf-8') as f:
        tokens = word_tokenize(f.read())
    # Single join instead of repeated string concatenation; every kept token is
    # still prefixed with one space, so the resulting text is unchanged.
    review = ''.join(' ' + word for word in tokens
                     if lemm.lemmatize(word) not in stop_words)
    review_list.append(review)
    labels_list.append('neg_' + file)

In [43]:
# Wrap the collected reviews and tags in the iterable corpus that is passed to
# build_vocab() and train() below.
it = LabeledLineSentence(doc_list = review_list, labels_list = labels_list)

In [44]:
# Doc2Vec with dm = 0 and 3000-dimensional document vectors, using one worker
# per CPU core.
# NOTE(review): alpha and min_alpha are both 0.025, which leaves no room for
# the learning rate to decay during train() — confirm a constant rate is intended.
model = Doc2Vec(size = 3000, window = 10, dm = 0, alpha=0.025, min_alpha=0.025, min_count=5, workers = multiprocessing.cpu_count())

model.build_vocab(it)
# build_vocab() records how many documents the corpus holds; pass that count
# instead of a hard-coded 4000 so it stays right if a folder has fewer files.
model.train(it, total_examples = model.corpus_count, epochs = 20)

# Persist the model so later cells can reload it without retraining.
model.save('partial_Doc2Vec.model')

In [54]:
# Reload the model written by model.save('partial_Doc2Vec.model').
model = Doc2Vec.load('partial_Doc2Vec.model')

In [55]:
# Spot-check: display the vector stored under the tag 'pos_0_9.txt'.
model.docvecs['pos_0_9.txt']

array([ 0.16584291,  0.17257604,  0.11534312, ...,  0.12235802,
       -0.14331858, -0.09695759], dtype=float32)

In [66]:
# Pre-allocate the training design matrix (2000 reviews x 3000 features)
# and the matching label vector; both are filled by the next two cells.
x_train = np.zeros((2000, 3000))
y_train = np.zeros(2000)

In [67]:
# Rows 0..999 of x_train: vectors of the positive training reviews, labelled 1.
# Tags are rebuilt from a fresh directory listing, so the lookup only succeeds
# if these names match the ones tagged when the corpus was loaded.
files = os.listdir('aclImdb/train/pos')[:1000]
for i in range(1000):
    tag = 'pos_' + files[i]
    x_train[i] = model.docvecs[tag]
    y_train[i] = 1

In [68]:
# Rows 1000..1999 of x_train: vectors of the negative training reviews, labelled 0.
# Tags are rebuilt from a fresh directory listing, so the lookup only succeeds
# if these names match the ones tagged when the corpus was loaded.
files = os.listdir('aclImdb/train/neg')[:1000]
for i in range(1000):
    row = i + 1000
    x_train[row] = model.docvecs['neg_' + files[i]]
    y_train[row] = 0

In [69]:
# Sanity check: print the filled training matrix and its label vector.
print(x_train)
print(y_train)

[[ 0.16584291  0.17257604  0.11534312 ...,  0.12235802 -0.14331858
  -0.09695759]
 [ 0.05659055  0.06830832  0.00634666 ..., -0.08080529 -0.09798458
   0.08413909]
 [-0.02922071 -0.04415205  0.09993537 ...,  0.01076222 -0.10168039
   0.04471296]
 ..., 
 [-0.03804242 -0.01907704  0.05541293 ...,  0.12065005 -0.12281355
  -0.09457269]
 [ 0.12019002  0.15035789  0.29966483 ..., -0.00458205 -0.07029743
  -0.16000935]
 [ 0.18866819  0.19506106  0.13287938 ...,  0.09275244  0.10702503
  -0.05094117]]
[ 1.  1.  1. ...,  0.  0.  0.]


In [57]:
# Pre-allocate the test design matrix (2000 reviews x 3000 features)
# and the matching label vector; both are filled by the next two cells.
x_test = np.zeros((2000, 3000))
y_test = np.zeros(2000)

In [61]:
# Rows 0..999 of x_test: vectors of the positive test reviews, labelled 1.
# Tags are rebuilt from a fresh directory listing, so the lookup only succeeds
# if these names match the ones tagged when the corpus was loaded.
files = os.listdir('aclImdb/test/pos')[:1000]
for i in range(1000):
    tag = 'pos_' + files[i]
    x_test[i] = model.docvecs[tag]
    y_test[i] = 1

In [63]:
# Rows 1000..1999 of x_test: vectors of the negative test reviews, labelled 0.
# Tags are rebuilt from a fresh directory listing, so the lookup only succeeds
# if these names match the ones tagged when the corpus was loaded.
files = os.listdir('aclImdb/test/neg')[:1000]
for i in range(1000):
    row = i + 1000
    x_test[row] = model.docvecs['neg_' + files[i]]
    y_test[row] = 0

In [72]:
# Sanity check: print the filled test matrix and its label vector.
print(x_test)
print(y_test)

[[  1.56103536e-01   3.50850783e-02   1.43749878e-01 ...,   1.60564911e-02
   -1.42835140e-01   2.08358653e-02]
 [ -3.63038443e-02  -5.89871407e-02  -1.28717916e-02 ...,   1.73746347e-01
   -2.48201862e-02  -2.21466105e-02]
 [  4.17211205e-02   1.32784203e-01   1.97908059e-01 ...,   9.37963426e-02
   -1.26963094e-01  -1.27556175e-01]
 ..., 
 [ -2.05457285e-01  -1.50586903e-01  -1.05547339e-01 ...,  -2.06781668e-04
   -2.39328876e-01  -1.84734568e-01]
 [  1.20190024e-01   1.50357887e-01   2.99664825e-01 ...,  -4.58205165e-03
   -7.02974349e-02  -1.60009354e-01]
 [ -4.54753414e-02   3.91226709e-02   1.54420316e-01 ...,   7.63216168e-02
    1.85473725e-01   1.47429243e-01]]
[ 1.  1.  1. ...,  0.  0.  0.]


In [71]:
# Fit a logistic-regression classifier with default settings on the
# document vectors (x_train) and their 1/0 sentiment labels (y_train).
clf = LogisticRegression()
clf.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [73]:
# Accuracy of the fitted classifier on the test vectors.
# NOTE(review): train and test reviews are tagged with the same 'pos_'/'neg_'
# prefix plus file name, so any file name present in both folders shares one
# vector — confirm this figure is not inflated by such overlap.
clf.score(x_test, y_test)

0.91849999999999998

## 실습 4-3-2. 생성한 Doc2Vec 모델을 활용하여, 전체 검증 데이터에 대한 감성분석을 수행한다
- 로지스틱 회귀 분류기(logistic regression classifier)를 활용하여 각 리뷰를 긍정/부정으로 분류한다
- 기계학습 알고리즘의 변수(feature)로는 Doc2Vec의 결과로 생성된 3000차원의 배열(array)을 활용한다

In [75]:
# Start over with empty accumulators for the full-dataset run:
# review_list[k] is the filtered text of a review, labels_list[k] its document tag.
review_list = []
labels_list = []

In [76]:
def _append_reviews(directory, tag_prefix):
    """Append every review found in ``directory`` to the corpus accumulators.

    For each entry listed in ``directory`` the file is read as UTF-8 and
    tokenized; tokens whose lemma is in ``stop_words`` are dropped, and the
    remaining tokens are concatenated, each prefixed with one space. The text
    is appended to ``review_list`` and ``tag_prefix + file name`` to
    ``labels_list``.

    Parameters
    ----------
    directory : str
        Folder containing one review per file.
    tag_prefix : str
        Prefix prepended to the file name to form the document tag.
    """
    for file in os.listdir(directory):
        # The with-block closes the file on exit; no explicit close() is needed.
        with open('{}/{}'.format(directory, file), 'r', encoding = 'utf-8') as f:
            tokens = word_tokenize(f.read())
        # Single join instead of repeated string concatenation.
        review = ''.join(' ' + word for word in tokens
                         if lemm.lemmatize(word) not in stop_words)
        review_list.append(review)
        labels_list.append(tag_prefix + file)


# NOTE(review): train and test reviews of the same class get the same prefix,
# so a file name present in both folders ends up sharing one tag — confirm, and
# disambiguate together with the cells that look the vectors up.
_append_reviews('aclImdb/train/pos', 'pos_')
_append_reviews('aclImdb/train/neg', 'neg_')
_append_reviews('aclImdb/test/pos', 'pos_')
_append_reviews('aclImdb/test/neg', 'neg_')
    
# Wrap the collected reviews and tags in the iterable corpus that is passed to
# build_vocab() and train() below.
it = LabeledLineSentence(doc_list = review_list, labels_list = labels_list)

# Doc2Vec with dm = 0 and 3000-dimensional document vectors, using one worker
# per CPU core.
# NOTE(review): alpha and min_alpha are both 0.025, which leaves no room for
# the learning rate to decay during train() — confirm a constant rate is intended.
model = Doc2Vec(size = 3000, window = 10, dm = 0, alpha=0.025, min_alpha=0.025,
                min_count=5, workers = multiprocessing.cpu_count())

model.build_vocab(it)

# build_vocab() records how many documents the corpus holds; pass that count
# instead of a hard-coded 50000 so it always matches what was actually loaded.
model.train(it, total_examples = model.corpus_count, epochs = 10)
# Persist the model so later cells can reload it without retraining.
model.save('full_Doc2Vec.model')

In [80]:
# Reload the model written by model.save('full_Doc2Vec.model').
model = Doc2Vec.load('full_Doc2Vec.model')

In [78]:
# Pre-allocate the training design matrix (25000 reviews x 3000 features)
# and the matching label vector; both are filled by the next two cells.
x_train = np.zeros((25000, 3000))
y_train = np.zeros(25000)

In [81]:
# Rows 0..12499 of x_train: vectors of the positive training reviews, labelled 1.
# Tags are rebuilt from a fresh directory listing of the same folder that was
# read when the corpus was loaded.
files = os.listdir('aclImdb/train/pos')
for i in range(12500):
    tag = 'pos_' + files[i]
    x_train[i] = model.docvecs[tag]
    y_train[i] = 1

In [82]:
# Rows 12500..24999 of x_train: vectors of the negative training reviews, labelled 0.
# Tags are rebuilt from a fresh directory listing of the same folder that was
# read when the corpus was loaded.
files = os.listdir('aclImdb/train/neg')
for i in range(12500):
    row = i + 12500
    x_train[row] = model.docvecs['neg_' + files[i]]
    y_train[row] = 0

In [83]:
# Pre-allocate the test design matrix (25000 reviews x 3000 features)
# and the matching label vector; both are filled by the next two cells.
x_test = np.zeros((25000, 3000))
y_test = np.zeros(25000)

In [84]:
# Rows 0..12499 of x_test: vectors of the positive test reviews, labelled 1.
# Tags are rebuilt from a fresh directory listing of the same folder that was
# read when the corpus was loaded.
files = os.listdir('aclImdb/test/pos')
for i in range(12500):
    tag = 'pos_' + files[i]
    x_test[i] = model.docvecs[tag]
    y_test[i] = 1

In [85]:
# Rows 12500..24999 of x_test: vectors of the negative test reviews, labelled 0.
# Tags are rebuilt from a fresh directory listing of the same folder that was
# read when the corpus was loaded.
files = os.listdir('aclImdb/test/neg')
for i in range(12500):
    row = i + 12500
    x_test[row] = model.docvecs['neg_' + files[i]]
    y_test[row] = 0

In [86]:
# Fit a logistic-regression classifier with default settings on the
# full-dataset document vectors (x_train) and their 1/0 sentiment labels (y_train).
clf = LogisticRegression()
clf.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [87]:
# Accuracy of the fitted classifier on the full set of test vectors.
# NOTE(review): train and test reviews are tagged with the same 'pos_'/'neg_'
# prefix plus file name, so any file name present in both folders shares one
# vector — confirm how much this overlap affects the figure.
clf.score(x_test, y_test)

0.79012000000000004