In [1]:
import requests

In [2]:
import lxml.html

In [3]:
reviews = []

url = 'http://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=39569&type=before'

for page in range(100, 500):
    res = requests.get(url.format(page))
    element = lxml.html.fromstring(res.text)
    for e in element.xpath('.//div[@class="score_result"]//li'):
        star = e.find('.//div[@class="star_score"]//em').text_content()
        comment = e.find('.//div[@class="score_reple"]//p').text_content()
        reviews.append([star, comment[3:]])

In [4]:
import csv
with open('review_before_D-War.csv', 'w', encoding='utf8') as f:
    w = csv.writer(f)
    w.writerow([ 'comment'])
    w.writerows(reviews)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
from konlpy.tag import Twitter

In [7]:
tagger = Twitter()

In [8]:
def get_word(text):
    nouns = tagger.nouns(text)
    return [noun for noun in nouns if len(noun) > 1]

In [9]:
cv = CountVectorizer(tokenizer=get_word, max_features=1000)

In [10]:
tdm = cv.fit_transform([r[1] for r in reviews])

In [11]:
noun_list = cv.get_feature_names()

In [12]:
import numpy

In [13]:
numpy.save('tdm.npy', tdm)

In [14]:
with open('nouns_before_D-War.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(noun_list))

In [15]:
tdm = numpy.load('tdm.npy').tolist()

In [16]:
with open('nouns_before_D-War.txt', encoding='utf8') as f:
    noun_list = f.read().splitlines()

In [17]:
from sklearn.cross_validation import train_test_split

In [18]:
stars = [int(r[0]) for r in reviews]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(tdm, stars, test_size=0.2, random_state=42)

In [20]:
from sklearn import linear_model

In [21]:
lm = linear_model.LinearRegression()

In [22]:
lm.fit (X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [23]:
import operator

In [24]:
def get_important_words(model, positive=True, n=8):
    return sorted(list(zip(noun_list, model.coef_)), key=operator.itemgetter(1), reverse=positive)[:n]

In [25]:
get_important_words(lm)

[('거가', 0.58625000000000005),
 ('기대', 0.58625000000000005),
 ('낚이', -0.20687500000000006),
 ('바보', -0.20687500000000006),
 ('승리', -0.20687500000000009),
 ('장난', -0.20687500000000009),
 ('진실', -0.20687500000000009),
 ('하나', -0.20687500000000009)]

In [26]:
get_important_words(lm,False)

[('승리', -0.20687500000000009),
 ('장난', -0.20687500000000009),
 ('진실', -0.20687500000000009),
 ('하나', -0.20687500000000009),
 ('낚이', -0.20687500000000006),
 ('바보', -0.20687500000000006),
 ('거가', 0.58625000000000005),
 ('기대', 0.58625000000000005)]

In [None]:
reviews = []

url = 'http://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=39569&type=after'

for page in range(100, 500):
    res = requests.get(url.format(page))
    element = lxml.html.fromstring(res.text)
    for e in element.xpath('.//div[@class="score_result"]//li'):
        star = e.find('.//div[@class="star_score"]//em').text_content()
        comment = e.find('.//div[@class="score_reple"]//p').text_content()
        reviews.append([star, comment[3:]])

In [45]:
import csv
with open('review_after_D-War.csv', 'w', encoding='utf8') as f:
    w = csv.writer(f)
    w.writerow(['star', 'comment'])
    w.writerows(reviews)

In [29]:
tagger = Twitter()

In [30]:
cv = CountVectorizer(tokenizer=get_word, max_features=1000)

In [31]:
tdm = cv.fit_transform([r[1] for r in reviews])

In [32]:
noun_list = cv.get_feature_names()

In [33]:
numpy.save('tdm.npy', tdm)

In [34]:
with open('nouns_after_D-War.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(noun_list))

In [35]:
tdm = numpy.load('tdm.npy').tolist()

In [36]:
with open('nouns_after_D-War.txt', encoding='utf8') as f:
    noun_list = f.read().splitlines()

In [37]:
stars = [int(r[0]) for r in reviews]

In [38]:
X_train, X_test, y_train, y_test = train_test_split(tdm, stars, test_size=0.2, random_state=42)

In [39]:
lm = linear_model.LinearRegression()

In [40]:
lm.fit (X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [41]:
def get_important_words(model, positive=True, n=8):
    return sorted(list(zip(noun_list, model.coef_)), key=operator.itemgetter(1), reverse=positive)[:n]

In [42]:
get_important_words(lm)

[('남성', 0.66001893939366019),
 ('달리', 0.66001893939366019),
 ('내부', 0.33000946969683009),
 ('리라', 0.33000946969683009),
 ('미술감독', 0.33000946969683009),
 ('배경', 0.33000946969683009),
 ('배우', 0.33000946969683009),
 ('상황', 0.33000946969683009)]

In [43]:
get_important_words(lm,False)

[('관람객', -1.5612499999999971),
 ('김태', -0.2701799242424246),
 ('당시', -0.2701799242424246),
 ('마코', -0.2701799242424246),
 ('투자', -0.2701799242424246),
 ('하정우', -0.2701799242424246),
 ('구원', -0.097578124999996921),
 ('라인', -0.097578124999996921)]