# 데이터 불러오기

## 원자료

In [1]:
import csv

In [2]:
# Read every data row of the review CSV into `reviews` (a list of lists of
# strings), discarding the first row (presumably the column header).
# newline='' hands line-ending handling to the csv module, as its
# documentation requires, so quoted fields containing embedded line breaks
# are parsed as a single field instead of being split.
with open('naver_review.csv', encoding='utf8', newline='') as f:
    reader = csv.reader(f)
    # The default of None avoids a bare StopIteration on an empty file.
    next(reader, None)
    reviews = list(reader)

## TDM

In [3]:
import numpy

In [4]:
# Load the saved array from disk and convert it to nested Python lists.
# NOTE(review): presumably a term-document matrix with one row per review,
# since it is split in lockstep with `stars` further down — confirm against
# the code that produced tdm.npy.
tdm = numpy.load('tdm.npy').tolist()

## 단어 목록

In [5]:
# Read the vocabulary file into a list of strings, one entry per line.
# NOTE(review): entry i is presumably the word for column i of `tdm`, because
# get_important_words() pairs this list with model.coef_ — confirm against
# the code that wrote both files.
with open('nouns.txt', encoding='utf8') as f:
    noun_list = f.read().splitlines()

# Training set / Test set

In [6]:
# train_test_split has lived in sklearn.model_selection since scikit-learn
# 0.18; the old sklearn.cross_validation module was removed in 0.20.
# The fallback keeps the notebook runnable on legacy installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [7]:
# Regression target: the first CSV column of every review, parsed as an int
# (presumably the star rating, going by the variable name — confirm against
# the CSV header).
stars = [int(r[0]) for r in reviews]

In [8]:
# Hold out 20% of the (matrix row, target) pairs as a test set; the fixed
# random_state makes the shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(tdm, stars, test_size=0.2, random_state=42)

# Linear Model

In [9]:
from sklearn import linear_model

In [10]:
# Ordinary least-squares baseline: no regularization, scikit-learn defaults.
lm = linear_model.LinearRegression()

In [11]:
# Estimate the coefficients from the training split only; the test split
# stays untouched until scoring.
lm.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## 결과 보기

In [12]:
import operator

In [13]:
def get_important_words(model, positive=True, n=8, words=None):
    """Return the ``n`` words with the most extreme coefficients in ``model``.

    Each word is paired with the coefficient at the same position in
    ``model.coef_``. ``zip`` stops at the shorter of the two sequences, so
    surplus words or coefficients are ignored.

    Args:
        model: Fitted estimator exposing ``coef_`` (assumed to be
            one-dimensional, i.e. one coefficient per word).
        positive: If true, rank from the largest coefficient downwards;
            otherwise rank from the smallest (most negative) upwards.
        n: Number of ``(word, coefficient)`` pairs to keep.
        words: Vocabulary aligned with ``model.coef_``. Defaults to the
            notebook-level ``noun_list`` when omitted.

    Returns:
        A list of at most ``n`` ``(word, coefficient)`` tuples in rank order.
    """
    vocabulary = noun_list if words is None else words
    ranked = sorted(
        zip(vocabulary, model.coef_),
        key=operator.itemgetter(1),
        reverse=positive,
    )
    return ranked[:n]

In [14]:
get_important_words(lm)

[('출연', 21.126844728702167),
 ('등장', 19.235301248022207),
 ('월광', 18.122941869896326),
 ('성도', 17.896878661970998),
 ('중심', 16.028102045516267),
 ('비교', 15.609075574584889),
 ('혼돈', 14.207378551063229),
 ('지릴뻔', 13.188020933476198)]

In [15]:
get_important_words(lm, False)

[('정체', -27.125870048395033),
 ('회수', -23.62647440097561),
 ('퀄리티', -19.124022046387577),
 ('흐트러진', -18.696977871855381),
 ('차지', -16.799745890714462),
 ('사탄', -15.082065879486869),
 ('당신', -14.072208625108733),
 ('개도', -13.796561536032987)]

In [16]:
lm.score(X_train, y_train)

0.62483192618898542

In [17]:
lm.score(X_test, y_test)

-1.9796767908845136

# Lasso Regression

In [18]:
# L1-penalized linear regression with a fixed penalty strength (alpha=0.01).
lasso = linear_model.Lasso(alpha=0.01)

In [19]:
# Fit the lasso model on the training split only.
lasso.fit(X_train, y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [20]:
get_important_words(lasso)

[('현혹', 0.88713581696690991),
 ('소름', 0.84968514539862283),
 ('한국', 0.70021990178298676),
 ('완전', 0.67825442406668324),
 ('최고', 0.60862345479841662),
 ('한번', 0.60141433009348277),
 ('대박', 0.58340521981655247),
 ('나홍진', 0.54710065882888215)]

In [21]:
get_important_words(lasso, False)

[('쓰레기', -3.0157856495097417),
 ('최악', -2.8314911049859814),
 ('실망', -2.3545628264857945),
 ('진심', -2.1289169617709827),
 ('별로', -2.1019194392410894),
 ('평론가', -1.6022189468578882),
 ('스트레스', -1.5782762715387715),
 ('노잼', -1.4914868801606267)]

In [22]:
lasso.score(X_train, y_train)

0.22711072765475204

In [23]:
lasso.score(X_test, y_test)

0.16421563673695438

# Ridge Regression

In [24]:
# L2-penalized linear regression with a fixed penalty strength (alpha=10).
ridge = linear_model.Ridge(alpha=10)

In [25]:
# Fit the ridge model on the training split only.
ridge.fit(X_train, y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [26]:
get_important_words(ridge)

[('현혹', 1.0615446772037),
 ('꿀잼', 1.045488155737704),
 ('한국', 1.019132347008276),
 ('완전', 0.92901253970836439),
 ('대박', 0.90294861060660192),
 ('소름', 0.87968533427780815),
 ('상영', 0.85783738307155033),
 ('오랜만', 0.78781656882893403)]

In [27]:
get_important_words(ridge, False)

[('최악', -2.1836060802094979),
 ('쓰레기', -2.1794533649184342),
 ('별로', -2.0442161245420194),
 ('실망', -1.9434633868501849),
 ('진심', -1.7660255907931675),
 ('노잼', -1.6403875727265937),
 ('스트레스', -1.6039323298229435),
 ('평론가', -1.5403047217963404)]

In [28]:
ridge.score(X_train, y_train)

0.39118057232479175

In [29]:
ridge.score(X_test, y_test)

0.18772341001452431

# LassoCV

In [30]:
# Lasso whose alpha is chosen by cross-validation; with no arguments the
# candidate alphas come from scikit-learn's automatically generated path.
lassocv = linear_model.LassoCV()

In [31]:
# Run the cross-validated alpha search and the final fit on the training
# split only.
lassocv.fit(X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [32]:
lassocv.alpha_

0.0044042722467688596

In [33]:
get_important_words(lassocv)

[('현혹', 1.1542006907606654),
 ('꿀잼', 1.0870515552945328),
 ('한국', 1.0249185904484286),
 ('완전', 0.98426458718318577),
 ('대박', 0.92037409166312578),
 ('소름', 0.87429295078965152),
 ('상영', 0.73455912085431208),
 ('오랜만', 0.70759770353490214)]

In [34]:
get_important_words(lassocv, False)

[('쓰레기', -4.3859818525450258),
 ('최악', -3.715644334789411),
 ('예수', -3.5464908051029314),
 ('진심', -2.9211490020643791),
 ('실망', -2.8420014535860054),
 ('평론가', -2.7031547321076626),
 ('점도', -2.6163250357422703),
 ('페이', -2.3182003406731408)]

In [35]:
lassocv.score(X_train, y_train)

0.32831600665376703

In [36]:
lassocv.score(X_test, y_test)

0.16900825071460457

# RidgeCV

In [37]:
# Ridge whose alpha is chosen by cross-validation from this explicit
# logarithmic grid of candidates (0.001 up to 100).
ridgecv = linear_model.RidgeCV(alphas=[.001, .01, .1, 1, 10, 100])

In [38]:
# Run the cross-validated alpha search and the final fit on the training
# split only.
ridgecv.fit(X_train, y_train)

RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100], cv=None, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [39]:
ridgecv.alpha_

10.0

In [40]:
get_important_words(ridgecv)

[('현혹', 1.0854353691558312),
 ('꿀잼', 1.0710396336242256),
 ('한국', 1.0355944863167748),
 ('완전', 0.94811057983410718),
 ('대박', 0.91198377663872521),
 ('소름', 0.90322135536274195),
 ('상영', 0.85590110552375742),
 ('오랜만', 0.80137135972941809)]

In [41]:
get_important_words(ridgecv, False)

[('최악', -2.1723033264913605),
 ('쓰레기', -2.1690158639571986),
 ('별로', -2.0209520378676804),
 ('실망', -1.9358181725786252),
 ('진심', -1.7500396804144183),
 ('노잼', -1.6294530240951346),
 ('스트레스', -1.6051898158444842),
 ('평론가', -1.5355451213565678)]

In [42]:
ridgecv.score(X_train, y_train)

0.39119389928724546

In [43]:
ridgecv.score(X_test, y_test)

0.18860527618885292

# Elastic Net

In [44]:
# Elastic net (mixed L1/L2 penalty) that cross-validates over both alpha and
# the L1 share; the l1_ratio candidates are 0.1, 0.2, ..., 0.9.
# NOTE(review): numpy.arange with a float step can yield an unexpected number
# of elements because of rounding; numpy's documentation recommends
# numpy.linspace for non-integer steps.
elastic = linear_model.ElasticNetCV(l1_ratio=numpy.arange(.1, 1.0, .1))

In [46]:
# Run the cross-validated search over (alpha, l1_ratio) and the final fit on
# the training split only.
elastic.fit(X_train, y_train)

ElasticNetCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9]),
       max_iter=1000, n_alphas=100, n_jobs=1, normalize=False,
       positive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=0.0001, verbose=0)

In [47]:
elastic.alpha_

0.0041454505711805917

In [48]:
elastic.l1_ratio_

0.40000000000000002

In [49]:
get_important_words(elastic)

[('현혹', 1.1417954841638336),
 ('꿀잼', 1.1078209495543085),
 ('한국', 1.0526229849609217),
 ('완전', 0.98976314958602352),
 ('대박', 0.95971897262172479),
 ('상영', 0.9156708687214834),
 ('소름', 0.88040995609757122),
 ('오랜만', 0.8061532529447677)]

In [50]:
get_important_words(elastic, False)

[('쓰레기', -2.9382010752555678),
 ('최악', -2.7683799989100377),
 ('실망', -2.3606284276357599),
 ('진심', -2.2018255403600566),
 ('별로', -2.1629749288301507),
 ('평론가', -1.9633450718613386),
 ('스트레스', -1.8238572339697352),
 ('노잼', -1.7803492039589577)]

In [51]:
elastic.score(X_train, y_train)

0.37185960289666742

In [52]:
elastic.score(X_test, y_test)

0.18370877610542646

# 읽을 거리

더 자세한 내용은 scikit-learn 홈페이지의 [Supervised Learning](https://scikit-learn.org/stable/supervised_learning.html)을 참조하세요.