# 네이버 영화평 수집


In [19]:
import requests
import lxml.html
import csv
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.tag import Kkma
import numpy
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
import operator

In [34]:
def get_word(text):
    nouns = tagger.nouns(text)
    return [noun for noun in nouns if len(noun) > 1 
            and noun!= "테스트"
            and noun!= "이곳"
            and noun!= "굿피플"
            and noun!= "우리"
            and noun!= "배우"
            and noun!= "걱정"
           ]

def get_important_words(model, positive=True, n=15):
    return sorted(list(zip(noun_list, model.coef_)), key=operator.itemgetter(1), reverse=positive)[:n]


In [35]:

campaigns = None
campaigns = []

# 네이버 해피빈 캠페인 주소
url = 'http://happybean.naver.com/happybeansearch/RaiseDonationSearch.nhn?query=&themeNo=&subThemeNo=&rdonastatus=4&sort=0&region=&page='

#money : 기부금액
#comment : 캠페인 내용

for page in range(1,700):
    res = requests.get(url.format(page))
    element = lxml.html.fromstring(res.text)
    for e in element.xpath('.//ul[@class="result_lst_area"]//li'):
        money = e.find('.//div[@class="info_area"]//em').text_content()
        money = money.replace(",","").replace("원 모금","")
        comment = e.findall('.//div[@class="txt_area"]//a')[1].text_content()
        campaigns.append([money, comment])

In [36]:
len(campaigns)

6990

In [258]:
#reviews
#campaigns

In [257]:
#campaigns

# Term-Document Matrix

In [37]:
tagger = Kkma()
cv = CountVectorizer(tokenizer=get_word, max_features=1000)
tdm = cv.fit_transform([r[1] for r in campaigns])
noun_list = cv.get_feature_names()

In [38]:
money = [int(r[0]) for r in campaigns]
X_train, X_test, y_train, y_test = train_test_split(tdm, money, test_size=0.2, random_state=42)
lm = linear_model.LinearRegression()    
lm.fit (X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [39]:
numpy.save('tdm.npy', tdm)

In [None]:
# 기부금이 많을 수록 어떤 단어가 많이 나오는지, 
#  사람들이 어떠한 단어나 이슈에 관심을 갖고 기부를 하는지 알수 있다.
# 직접적인 경제적인 어려움에 대한 문제를 언급했을때 기부금이 커지는것을 알 수 있고,
# 사회적인 문제나, 해외문제에 대해서는 상대적으로 관심이 적은것을 알수 있다.

In [40]:
get_important_words(lm)

[('가정의', 3567686.901715986),
 ('가족', 3567686.901715986),
 ('문화적', 3567686.901715986),
 ('서로', 3567686.901715986),
 ('선물', 3567686.901715986),
 ('소외', 3567686.901715986),
 ('저소득', 3567686.901715986),
 ('저소득가정의', 3567686.901715986),
 ('혜택', 3567686.901715986),
 ('경제적', 3469197.5147312875),
 ('아동', 3041722.0545917931),
 ('어려움', 2807871.3742601783),
 ('하루', 1524127.5132189491),
 ('가명', 233083.19378573811),
 ('가정', 233083.19378573811)]

In [41]:
get_important_words(lm, False)

[('사회', -2142048.7754818043),
 ('끼니', -2043559.3884970178),
 ('냉골', -2043559.3884970178),
 ('세상', -2043559.3884970178),
 ('치료', -1039026.0887913768),
 ('카나', -1039026.0887913768),
 ('케냐', -1039026.0887913768),
 ('투르카나', -1039026.0887913768),
 ('피플', -1039026.0887913768),
 ('희망', -1039026.0887913768),
 ('아이', -805942.89500568411),
 ('79', -759815.52745583002),
 ('79세', -759815.52745583002),
 ('기본', -759815.52745583002),
 ('기본적인', -759815.52745583002)]

In [42]:
ridgecv = linear_model.RidgeCV(alphas=[.001, .01, .1, 1, 10, 100])
ridgecv.fit (X_train, y_train)

RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100], cv=None, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [43]:
ridgecv.alpha_

0.001

In [44]:
get_important_words(ridgecv)

[('가정의', 3567686.3358207592),
 ('가족', 3567686.3358207592),
 ('문화적', 3567686.3358207592),
 ('서로', 3567686.3358207592),
 ('선물', 3567686.3358207592),
 ('소외', 3567686.3358207592),
 ('저소득', 3567686.3358207592),
 ('저소득가정의', 3567686.3358207592),
 ('혜택', 3567686.3358207592),
 ('경제적', 3469196.9431156465),
 ('아동', 3041721.5654045297),
 ('어려움', 2807870.9684514757),
 ('하루', 1524127.7906899159),
 ('가명', 233083.15189422472),
 ('가정', 233083.15189422472)]

In [45]:
get_important_words(ridgecv, False)

[('사회', -2142047.9378359746),
 ('끼니', -2043558.5451308857),
 ('냉골', -2043558.5451308857),
 ('세상', -2043558.5451308857),
 ('치료', -1039025.8139506911),
 ('카나', -1039025.8139506911),
 ('케냐', -1039025.8139506911),
 ('투르카나', -1039025.8139506911),
 ('피플', -1039025.8139506911),
 ('희망', -1039025.8139506911),
 ('아이', -805942.66205646703),
 ('79', -759815.36736930523),
 ('79세', -759815.36736930523),
 ('기본', -759815.36736930523),
 ('기본적인', -759815.36736930523)]

In [46]:
ridgecv.score(X_train, y_train)

0.97692449527146052

In [47]:
ridgecv.score(X_test, y_test)

0.97694953241225269