# 국민대 빅데이터경영MBA '비정형 빅데이터 분석' 수업 과제.

Update : 2016-06-18, 최희윤(U2015049).

# 분석에 필요한 패키지 준비

In [3]:
import requests

In [4]:
import lxml.html

In [5]:
import csv

In [6]:
from konlpy.tag import Twitter

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
import numpy

In [9]:
import operator

# 웹크롤링

아래는 네이버 뉴스에서 '정치'란 키워드로 검색해 나온 기사 제목들을 크롤링한 것입니다.

In [42]:
url = 'http://news.naver.com/main/search/search.nhn?refresh=&so=rel.dsc&stPhoto=&stPaper=&stRelease=&ie=MS949&detail=0&rcsection=&query=%C1%A4%C4%A1&x=0&y=0&sm=all.basic&pd=1&startDate=&endDate='

In [43]:
with open('politics.csv', 'w', encoding='utf8') as f:
    writer = csv.writer(f)
    for i in range(1, 21):
        res = requests.get(url.format(i))  # 기사 목록
        element = lxml.html.fromstring(res.text)
        for news_link in element.xpath('.//a[@class="go_naver"]'):
            try:
                res = requests.get(news_link.attrib['href'])   # 네이버 뉴스 링크
                news = lxml.html.fromstring(res.text)
                body = news.xpath('.//div[@id="articleBodyContents"]')[0]
                writer.writerow([body.text_content()])
            except:
                continue

# 웹크롤링 결과를 불러와 분석 준비

In [44]:
politics = []
with open('politics.csv', encoding='utf8', newline='\r\n') as f:
    reader = csv.reader(f)
    for row in reader :
        politics.append(row[0])

# 형태소 분석기 준비

In [45]:
tagger = Twitter()

# Term-Document Matrix 만들기

In [46]:
cv = CountVectorizer(tokenizer=tagger.nouns, max_features=50)

In [47]:
tdf = cv.fit_transform(politics)

In [48]:
tdf

<180x50 sparse matrix of type '<class 'numpy.int64'>'
	with 5420 stored elements in Compressed Sparse Row format>

# 단어 목록

In [49]:
words = cv.get_feature_names()

In [50]:
words

['것',
 '계파',
 '고',
 '구두',
 '국민',
 '국민의당',
 '국정',
 '국회',
 '그',
 '금지',
 '기',
 '기자',
 '년',
 '논평',
 '다툼',
 '대변인',
 '대통령',
 '대표',
 '대한',
 '등',
 '라며',
 '며',
 '무단',
 '민생',
 '박',
 '박근혜',
 '배신',
 '배포',
 '복당',
 '새누리당',
 '서울',
 '수습',
 '안',
 '여당',
 '운동',
 '원내',
 '월',
 '유',
 '유승민',
 '의원',
 '일',
 '재',
 '전',
 '정치',
 '주도',
 '집권',
 '청와대',
 '친',
 '콕스',
 '현안']

# 한 글자 짜리 단어 제외하기

In [51]:
def get_word(doc):
    nouns = tagger.nouns(doc)
    return [noun for noun in nouns if len(noun) > 1]

In [52]:
cv = CountVectorizer(tokenizer=get_word, max_features=50)

In [53]:
tdf = cv.fit_transform(politics)

In [54]:
words = cv.get_feature_names()

In [55]:
words

['계파',
 '구두',
 '국민',
 '국민의당',
 '국정',
 '국회',
 '금지',
 '기자',
 '내분',
 '노동당',
 '논평',
 '다툼',
 '대변인',
 '대통령',
 '대표',
 '대한',
 '대해',
 '동민',
 '라며',
 '무단',
 '민생',
 '민주당',
 '박근혜',
 '발언',
 '배신',
 '배포',
 '복당',
 '사태',
 '새누리당',
 '서울',
 '수습',
 '여당',
 '영국',
 '오늘',
 '우리',
 '운동',
 '원내',
 '위해',
 '유승민',
 '의원',
 '전재',
 '정치',
 '주도',
 '증오',
 '집권',
 '책임',
 '청와대',
 '총선',
 '콕스',
 '현안']

# 단어별 출현 빈도 데이터 만들기

In [56]:
count_mat = tdf.sum(axis=0)

In [57]:
count_mat

matrix([[ 200,  220,  900,  180,  280,  240,  160,  160,  140,  140,  240,
          180,  440,  540,  240,  180,  160,  140,  220,  160,  340,  140,
          180,  120,  160,  160,  200,  140,  500,  180,  160,  360,  160,
          140,  160,  220,  380,  120,  240, 1180,  140,  280,  220,  120,
          200,  120,  240,  120,  280,  200]], dtype=int64)

In [58]:
count = numpy.squeeze(numpy.asarray(count_mat))

In [59]:
count

array([ 200,  220,  900,  180,  280,  240,  160,  160,  140,  140,  240,
        180,  440,  540,  240,  180,  160,  140,  220,  160,  340,  140,
        180,  120,  160,  160,  200,  140,  500,  180,  160,  360,  160,
        140,  160,  220,  380,  120,  240, 1180,  140,  280,  220,  120,
        200,  120,  240,  120,  280,  200], dtype=int64)

In [60]:
word_count = list(zip(words, count))

In [61]:
word_count

[('계파', 200),
 ('구두', 220),
 ('국민', 900),
 ('국민의당', 180),
 ('국정', 280),
 ('국회', 240),
 ('금지', 160),
 ('기자', 160),
 ('내분', 140),
 ('노동당', 140),
 ('논평', 240),
 ('다툼', 180),
 ('대변인', 440),
 ('대통령', 540),
 ('대표', 240),
 ('대한', 180),
 ('대해', 160),
 ('동민', 140),
 ('라며', 220),
 ('무단', 160),
 ('민생', 340),
 ('민주당', 140),
 ('박근혜', 180),
 ('발언', 120),
 ('배신', 160),
 ('배포', 160),
 ('복당', 200),
 ('사태', 140),
 ('새누리당', 500),
 ('서울', 180),
 ('수습', 160),
 ('여당', 360),
 ('영국', 160),
 ('오늘', 140),
 ('우리', 160),
 ('운동', 220),
 ('원내', 380),
 ('위해', 120),
 ('유승민', 240),
 ('의원', 1180),
 ('전재', 140),
 ('정치', 280),
 ('주도', 220),
 ('증오', 120),
 ('집권', 200),
 ('책임', 120),
 ('청와대', 240),
 ('총선', 120),
 ('콕스', 280),
 ('현안', 200)]

# 빈도순 정렬

In [62]:
sorted(word_count, key=operator.itemgetter(1), reverse=True)

[('의원', 1180),
 ('국민', 900),
 ('대통령', 540),
 ('새누리당', 500),
 ('대변인', 440),
 ('원내', 380),
 ('여당', 360),
 ('민생', 340),
 ('국정', 280),
 ('정치', 280),
 ('콕스', 280),
 ('국회', 240),
 ('논평', 240),
 ('대표', 240),
 ('유승민', 240),
 ('청와대', 240),
 ('구두', 220),
 ('라며', 220),
 ('운동', 220),
 ('주도', 220),
 ('계파', 200),
 ('복당', 200),
 ('집권', 200),
 ('현안', 200),
 ('국민의당', 180),
 ('다툼', 180),
 ('대한', 180),
 ('박근혜', 180),
 ('서울', 180),
 ('금지', 160),
 ('기자', 160),
 ('대해', 160),
 ('무단', 160),
 ('배신', 160),
 ('배포', 160),
 ('수습', 160),
 ('영국', 160),
 ('우리', 160),
 ('내분', 140),
 ('노동당', 140),
 ('동민', 140),
 ('민주당', 140),
 ('사태', 140),
 ('오늘', 140),
 ('전재', 140),
 ('발언', 120),
 ('위해', 120),
 ('증오', 120),
 ('책임', 120),
 ('총선', 120)]