### newspaper analysis example

In [73]:
library(tidyverse)
library(tidytext)
library(KoNLP)
library(dplyr)

In [25]:
# Build the Sejong dictionary before any KoNLP tokeniser is called further
# down (the cell output reports the number of dictionary words built).
useSejongDic()

Backup was just finished!
370957 words dictionary was built.


### stop_words

In [4]:
# Tab-separated stop-word list (columns: lexicon, word, tag, score).
# Note: this global shadows the `stop_words` dataset exported by tidytext.
stop_words <- read_delim(
  file = "stop_words/stop_words.txt",
  delim = "\t"
)

Parsed with column specification:
cols(
  lexicon = col_character(),
  word = col_character(),
  tag = col_character(),
  score = col_double()
)


In [6]:
# Preview the first five stop words.
head(stop_words, 5)

lexicon,word,tag,score
k_corpus,이,VCP,0.018279601
k_corpus,있,VA,0.011699048
k_corpus,하,VV,0.009773658
k_corpus,것,NNB,0.00973315
k_corpus,들,XSN,0.00689824


### sentiment_words

In [180]:
# Tab-separated sentiment dictionary; the column names are Korean
# (see the column specification printed below).
sentiments_words <- read_delim(
  file = "sentiments/sentiment_dic.txt",
  delim = "\t"
)

Parsed with column specification:
cols(
  일련번호 = col_double(),
  단어 = col_character(),
  감정범주 = col_character(),
  빈도 = col_double(),
  감정정도M = col_double(),
  감정정도STD = col_double()
)


In [186]:
# Keep only the dictionary rows that carry a serial number (일련번호).
sentiments_words <- sentiments_words %>%
  filter(!is.na(일련번호))

In [9]:
# N-gram lexicon: expressive-type proportions per n-gram.
sentiments_expressive <- read_csv(file = "sentiments/expressive-type.csv")
head(sentiments_expressive, 5)

Parsed with column specification:
cols(
  ngram = col_character(),
  freq = col_double(),
  `dir-action` = col_double(),
  `dir-explicit` = col_double(),
  `dir-speech` = col_double(),
  indirect = col_double(),
  `writing-device` = col_double(),
  max.value = col_character(),
  max.prop = col_double()
)


ngram,freq,dir-action,dir-explicit,dir-speech,indirect,writing-device,max.value,max.prop
가*/JKS,1,0,0.0,1.0,0.0,0.0,dir-speech,1.0
가*/VV,3,0,0.3333333,0.3333333,0.33333333,0.0,dir-explicit,0.3333333
가/JKC,17,0,0.4117647,0.4117647,0.17647059,0.0,dir-explicit,0.4117647
가/JKS,112,0,0.375,0.3125,0.28571429,0.02678571,dir-explicit,0.375
가/VV,11,0,0.0,0.8181818,0.09090909,0.09090909,dir-speech,0.8181818


In [10]:
# N-gram lexicon: intensity (High / Low / Medium / None) proportions.
sentiments_intensity <- read_csv(file = "sentiments/intensity.csv")
head(sentiments_intensity, 5)

Parsed with column specification:
cols(
  ngram = col_character(),
  freq = col_double(),
  High = col_double(),
  Low = col_double(),
  Medium = col_double(),
  None = col_double(),
  max.value = col_character(),
  max.prop = col_double()
)


ngram,freq,High,Low,Medium,None,max.value,max.prop
가*/JKS,1,0.0,0.0,1.0,0.0,Medium,1.0
가*/VV,3,0.0,0.3333333,0.6666667,0.0,Medium,0.6666667
가/JKC,17,0.17647059,0.2352941,0.5882353,0.0,Medium,0.5882353
가/JKS,112,0.16071429,0.1071429,0.6964286,0.03571429,Medium,0.6964286
가/VV,11,0.09090909,0.1818182,0.6363636,0.09090909,Medium,0.6363636


In [11]:
# N-gram lexicon: nested-order (0-3) proportions.
sentiments_nested <- read_csv(file = "sentiments/nested-order.csv")
head(sentiments_nested, 5)

Parsed with column specification:
cols(
  ngram = col_character(),
  freq = col_double(),
  `0` = col_double(),
  `1` = col_double(),
  `2` = col_double(),
  `3` = col_double(),
  max.value = col_double(),
  max.prop = col_double()
)


ngram,freq,0,1,2,3,max.value,max.prop
가*/JKS,1,0.0,1.0,0.0,0,1,1.0
가*/VV,3,0.6666667,0.3333333,0.0,0,0,0.6666667
가/JKC,17,0.3529412,0.5882353,0.05882353,0,1,0.5882353
가/JKS,112,0.4464286,0.5535714,0.0,0,1,0.5535714
가/VV,11,0.2727273,0.7272727,0.0,0,1,0.7272727


In [89]:
# N-gram lexicon: polarity (COMP / NEG / NEUT / None / POS) proportions.
sentiments_polarity <- read_csv(file = "sentiments/polarity.csv")
head(sentiments_polarity, 5)

Parsed with column specification:
cols(
  ngram = col_character(),
  freq = col_double(),
  COMP = col_double(),
  NEG = col_double(),
  NEUT = col_double(),
  None = col_double(),
  POS = col_double(),
  max.value = col_character(),
  max.prop = col_double()
)


ngram,freq,COMP,NEG,NEUT,None,POS,max.value,max.prop
가*/JKS,1,0.00000000,0.0000000,0.00000000,0.00000000,1.0000000,POS,1.0000000
가*/JKS;있/VV,1,0.00000000,0.0000000,0.00000000,0.00000000,1.0000000,POS,1.0000000
가*/JKS;있/VV;었/EP,1,0.00000000,0.0000000,0.00000000,0.00000000,1.0000000,POS,1.0000000
가*/VV,3,0.00000000,0.0000000,0.00000000,0.00000000,1.0000000,POS,1.0000000
가*/VV;ㄴ다*/EF,1,0.00000000,0.0000000,0.00000000,0.00000000,1.0000000,POS,1.0000000
가/JKC,17,0.00000000,0.4705882,0.23529412,0.00000000,0.2941176,NEG,0.4705882
가/JKC;되/VV,11,0.00000000,0.3636364,0.27272727,0.00000000,0.3636364,NEG,0.3636364
가/JKC;되/VV;ㄴ/ETM,2,0.00000000,0.0000000,1.00000000,0.00000000,0.0000000,NEUT,1.0000000
가/JKC;되/VV;ㄹ/ETM,1,0.00000000,0.0000000,1.00000000,0.00000000,0.0000000,NEUT,1.0000000
가/JKC;되/VV;어/EC,2,0.00000000,1.0000000,0.00000000,0.00000000,0.0000000,NEG,1.0000000


In [13]:
# N-gram lexicon: subjectivity-polarity (COMP / NEG / NEUT / POS) proportions.
sentiments_sub_polarity <- read_csv(
  file = "sentiments/subjectivity-polarity.csv"
)
head(sentiments_sub_polarity, 5)

Parsed with column specification:
cols(
  ngram = col_character(),
  freq = col_double(),
  COMP = col_double(),
  NEG = col_double(),
  NEUT = col_double(),
  POS = col_double(),
  max.value = col_character(),
  max.prop = col_double()
)


ngram,freq,COMP,NEG,NEUT,POS,max.value,max.prop
가*/JKS,1,0,0,0,1,POS,1
가*/JKS;있/VV,1,0,0,0,1,POS,1
가*/JKS;있/VV;었/EP,1,0,0,0,1,POS,1
가*/VV,3,0,0,0,1,POS,1
가*/VV;ㄴ다*/EF,1,0,0,0,1,POS,1


In [14]:
# N-gram lexicon: subjectivity-type (Agreement, Argument, ...) proportions.
sentiments_sub_type <- read_csv(file = "sentiments/subjectivity-type.csv")
head(sentiments_sub_type, 5)

Parsed with column specification:
cols(
  ngram = col_character(),
  freq = col_double(),
  Agreement = col_double(),
  Argument = col_double(),
  Emotion = col_double(),
  Intention = col_double(),
  Judgment = col_double(),
  Others = col_double(),
  Speculation = col_double(),
  max.value = col_character(),
  max.prop = col_double()
)


ngram,freq,Agreement,Argument,Emotion,Intention,Judgment,Others,Speculation,max.value,max.prop
가*/JKS,1,0.0,1.0,0.0,0.0,0.0,0.0,0,Argument,1.0
가*/VV,3,0.0,0.3333333,0.0,0.0,0.6666667,0.0,0,Judgment,0.6666667
가/JKC,17,0.058823529,0.3529412,0.0,0.0,0.5882353,0.0,0,Judgment,0.5882353
가/JKS,112,0.008928571,0.3303571,0.05357143,0.008928571,0.5714286,0.02678571,0,Judgment,0.5714286
가/VV,11,0.0,0.7272727,0.0,0.0,0.1818182,0.09090909,0,Argument,0.7272727


## data import

In [18]:
# One tab-separated extract per search keyword. Each file has the columns
# 식별자, 일자, 언론사, 제목, 본문 (see the column specifications printed below).
data_economy <- read_delim(file = "data/txt_extract/경제.txt", delim = "\t")
data_sewol <- read_delim(file = "data/txt_extract/세월호.txt", delim = "\t")
data_pay <- read_delim(file = "data/txt_extract/최저임금.txt", delim = "\t")
data_demo1 <- read_delim(file = "data/txt_extract/민주당.txt", delim = "\t")
data_demo2 <- read_delim(file = "data/txt_extract/더불어민주당.txt", delim = "\t")
data_libe1 <- read_delim(file = "data/txt_extract/한국당.txt", delim = "\t")
data_libe2 <- read_delim(file = "data/txt_extract/자유한국당.txt", delim = "\t")

# Add a `검색어` (search keyword) column to `df`, filled with `keyword`.
#
# df:      data frame of articles.
# keyword: value to store in the new column.
# Returns `df` with the extra column appended.
mutate_01 <- function(df, keyword) {
  mutate(df, 검색어 = keyword)
}

# mutate_01(data_economy, "경제")
# mutate_01(data_sewol, "세월호")
# mutate_01(data_pay, "최저임금")

# Stack the seven per-keyword extracts, labelling each with its search keyword.
data <- bind_rows(
  mutate_01(data_economy, "경제"),
  mutate_01(data_sewol, "세월호"),
  mutate_01(data_pay, "최저임금"),
  mutate_01(data_demo1, "민주당"),
  mutate_01(data_demo2, "더불어민주당"),
  mutate_01(data_libe1, "한국당"),
  mutate_01(data_libe2, "자유한국당")
)

# Replace the Korean column headers with short English names.
data <- data %>%
  rename(
    pid = 식별자,
    date = 일자,
    news = 언론사,
    title = 제목,
    article = 본문,
    keyword = 검색어
  )

# Number the rows and keep only the working columns (this drops `pid`).
data <- data %>%
  mutate(id = row_number()) %>%
  select(id, date, news, title, article, keyword)

# Collapse the long party names onto their short forms so that both search
# keywords for a party end up under one label:
#   자유한국당   -> 한국당
#   더불어민주당 -> 민주당
# The replacements run inside a single mutate() on the masked `keyword`
# column. This avoids reaching back into the outer `data` object from within
# mutate() and avoids creating temporary keyword2 / keyword3 columns.
# `keyword` keeps its position as the last column.
data <- data %>%
  mutate(
    keyword = str_replace(keyword, "자유한국당", "한국당"),
    keyword = str_replace(keyword, "더불어민주당", "민주당")
  )

# Slice the yyyymmdd `date` value into year / month / day by character
# position, then put the columns in their final order.
data <- data %>%
  mutate(
    year = substr(date, 1, 4),
    month = substr(date, 5, 6),
    day = substr(date, 7, 8)
  ) %>%
  select(id, date, year, month, day, news, keyword, title, article)


Parsed with column specification:
cols(
  식별자 = col_character(),
  일자 = col_double(),
  언론사 = col_character(),
  제목 = col_character(),
  본문 = col_character()
)
Parsed with column specification:
cols(
  식별자 = col_character(),
  일자 = col_double(),
  언론사 = col_character(),
  제목 = col_character(),
  본문 = col_character()
)
Parsed with column specification:
cols(
  식별자 = col_character(),
  일자 = col_double(),
  언론사 = col_character(),
  제목 = col_character(),
  본문 = col_character()
)
Parsed with column specification:
cols(
  식별자 = col_character(),
  일자 = col_double(),
  언론사 = col_character(),
  제목 = col_character(),
  본문 = col_character()
)
Parsed with column specification:
cols(
  식별자 = col_character(),
  일자 = col_double(),
  언론사 = col_character(),
  제목 = col_character(),
  본문 = col_character()
)
Parsed with column specification:
cols(
  식별자 = col_character(),
  일자 = col_double(),
  언론사 = col_character(),
  제목 = col_character(),
  본문 = col_character()
)
Parsed with column specification:
cols(


In [23]:
# Show the first article record.
head(data, 1)

id,date,year,month,day,news,keyword,title,article
1,20190613,2019,6,13,강원일보,경제,"[사설]내년 총선, 경제 살리기 일자리 대책이 승부 갈라","정계개편이 10개월 앞으로 다가온 내년 4 15 총선의 최대 변수로 부상하고 있다. 우선 `보수 대통합'을 향해 보수진영이 꿈틀대고 있다. 현재 보수는 자유한국당과 옛 새누리당에 뿌리를 둔 바른정당계 바른미래당 의원 일부, 대한애국당 등으로 흩어져 있다. 내년 총선 전 보수진영의 통합이 이뤄질 경우 정계개편은 물론 중도층의 표심에도 적지 않은 파장을 불.."


In [27]:
# Working subset: two newspapers, years 2018-2019.
t <- data %>%
  filter(
    news %in% c("조선일보", "경향신문"),
    year %in% c(2019, 2018)
  )

In [28]:
# Tokenise the article bodies with SimplePos22. Each output row is one token
# string of "morpheme/tag" pieces joined by "+" (see the sample output below).
t_simplepos22 <- t %>%
  select(id, year, month, news, keyword, article) %>%
  unnest_tokens(output = simplepos22, input = article, token = SimplePos22)

simplepos09, simplepos22 참조: https://brunch.co.kr/@mapthecity/9

In [83]:
# Display the tokenised table.
t_simplepos22

id,year,month,news,keyword,simplepos22
9,2019,06,조선일보,경제,'/sr+경제/nc+가/jc
9,2019,06,조선일보,경제,탄탄하다'던/nc
9,2019,06,조선일보,경제,정부/nc
9,2019,06,조선일보,경제,여당/nc+이/jc
9,2019,06,조선일보,경제,갑자기/ma
9,2019,06,조선일보,경제,'/sr+대외/nc
9,2019,06,조선일보,경제,여건/nc
9,2019,06,조선일보,경제,악화/nc+'/sr+를/jc
9,2019,06,조선일보,경제,내세우/pv+면서/ec
9,2019,06,조선일보,경제,"""/sl+추경예산/nc"


In [88]:
# Split each token on "+" into at most six pieces (any overflow is merged into
# the last one, `l`) and show only the tokens that have a sixth piece.
t_simplepos22 %>%
  separate(
    simplepos22,
    into = c("a", "b", "c", "d", "e", "l"),
    sep = "\\+",
    extra = "merge",
    convert = TRUE
  ) %>%
  filter(!is.na(l))

“Expected 6 pieces. Missing pieces filled with `NA` in 10364 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].”

id,year,month,news,keyword,a,b,c,d,e,l
9,2019,6,조선일보,경제,1/nn,만/jx,~/sd,2만/nn,개/nb,를/jc
27,2019,6,조선일보,경제,조치/nc,하/xs,겠/ep,다/ef,"""/sr",고/jc
40,2019,5,경향신문,경제,이/np,이/jp,어/ec,가/px,아/ep,"며/ec+,/sp"
40,2019,5,경향신문,경제,소득최하위/nc,층/xs,(/sl,1/nn,분/nb,위/xs+)/sr+과/jc
40,2019,5,경향신문,경제,소득최상위/nc,층/xs,(/sl,5/nn,분/nb,위/xs+)/sr
58,2019,5,조선일보,경제,낮/pa,아/ec,지/px,었/ep,다/ef,"""/sr+고/jc"
76,2019,5,조선일보,경제,유/nc,이/jp,지/ec,하/px,었/ep,"고/ec+,/sp"
155,2019,3,조선일보,경제,수/nb,만/jx,~/sd,수십만/nn,명/nb,씩/xs
164,2019,3,조선일보,경제,높/pa,아/ec,지/px,었/ep,다/ef,./sf
169,2019,3,조선일보,경제,개선/nc,되/xs,었/ep,다/ef,"""/sr",고/jc


In [95]:
# Ten longest token strings by character count, used to judge how many "+"
# separated pieces a token can contain.
t_simplepos22 %>%
  mutate(sp22_LEN = str_length(simplepos22)) %>%
  arrange(desc(sp22_LEN)) %>%
  head(10)

id,year,month,news,keyword,simplepos22,sp22_LEN
40,2019,5,경향신문,경제,소득최하위/nc+층/xs+(/sl+1/nn+분/nb+위/xs+)/sr+과/jc,43
8278,2019,1,조선일보,최저임금,10/nn+%/su+(/sl+6800/nn+여/xs+명/nb+)/sr+가/jc,43
10806,2018,7,조선일보,한국당,맡/pv+기/et+이/jp+겠/ep+다/ef+는/et+데/nb+도/jx,39
40,2019,5,경향신문,경제,소득최상위/nc+층/xs+(/sl+5/nn+분/nb+위/xs+)/sr,38
10290,2019,4,경향신문,한국당,신속처리안/nc+건/xs+(/sl+패스트트랙/nc+)/sr+으로/jc,38
704,2018,7,조선일보,경제,"""/sl+뼈아프/pa+다/ef+""/sr+면/nb+서/jc+도/jx",36
225,2019,2,조선일보,경제,탕감/nc+하/xs+어/ec+주/px+겠/ep+다/ef+는/et,35
7660,2018,4,조선일보,세월호,안전/nc+하/xs+어/ec+지/px+었/ep+다/ef+고/jc,35
10464,2018,12,경향신문,한국당,"이/np+이/jp+어/ec+가/px+아/ep+지만/ec+,/sp",35
40,2019,5,경향신문,경제,"이/np+이/jp+어/ec+가/px+아/ep+며/ec+,/sp",34


In [105]:
# Split every token on "+" into the columns c_01 ... c_09, keeping the
# original string. Tokens with fewer pieces are padded with NA; anything
# beyond nine pieces is merged into c_09.
t_simplepos22_sep <- t_simplepos22 %>%
  separate(
    simplepos22,
    into = sprintf("c_%02d", 1:9),
    sep = "\\+",
    remove = FALSE,
    extra = "merge",
    convert = TRUE
  )

“Expected 9 pieces. Missing pieces filled with `NA` in 10407 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].”

In [129]:

# Inspect the last ten rows of the wide token table.
tail(t_simplepos22_sep, 10)

id,year,month,news,keyword,simplepos22,c_01,c_02,c_03,c_04,c_05,c_06,c_07,c_08,c_09
10806,2018,7,조선일보,한국당,탄핵/nc,탄핵/nc,,,,,,,,
10806,2018,7,조선일보,한국당,선고/nc+를/jc,선고/nc,를/jc,,,,,,,
10806,2018,7,조선일보,한국당,내리/pv+ㄴ/et,내리/pv,ㄴ/et,,,,,,,
10806,2018,7,조선일보,한국당,이정미/nc,이정미/nc,,,,,,,,
10806,2018,7,조선일보,한국당,전/nb,전/nb,,,,,,,,
10806,2018,7,조선일보,한국당,헌법재판관/nc+을/jc,헌법재판관/nc,을/jc,,,,,,,
10806,2018,7,조선일보,한국당,포함해/nc,포함해/nc,,,,,,,,
10806,2018,7,조선일보,한국당,40/nn+명/nb+이/jc,40/nn,명/nb,이/jc,,,,,,
10806,2018,7,조선일보,한국당,널/pv+ㅁ/et+./sf,널/pv,ㅁ/et,./sf,,,,,,
10806,2018,7,조선일보,한국당,./sf,./sf,,,,,,,,


In [107]:
# Show the first piece (c_01) of each token next to its full token string.
select(t_simplepos22_sep, id, year, month, news, keyword, simplepos22, c_01)

id,year,month,news,keyword,simplepos22,c_01
9,2019,06,조선일보,경제,'/sr+경제/nc+가/jc,'/sr
9,2019,06,조선일보,경제,탄탄하다'던/nc,탄탄하다'던/nc
9,2019,06,조선일보,경제,정부/nc,정부/nc
9,2019,06,조선일보,경제,여당/nc+이/jc,여당/nc
9,2019,06,조선일보,경제,갑자기/ma,갑자기/ma
9,2019,06,조선일보,경제,'/sr+대외/nc,'/sr
9,2019,06,조선일보,경제,여건/nc,여건/nc
9,2019,06,조선일보,경제,악화/nc+'/sr+를/jc,악화/nc
9,2019,06,조선일보,경제,내세우/pv+면서/ec,내세우/pv
9,2019,06,조선일보,경제,"""/sl+추경예산/nc","""/sl"


In [117]:
# Extract one piece column from the wide token table and expose it as `tag`.
#
# column: name of the piece column to extract, as a string (e.g. "c_02").
# df:     wide token table to read from; defaults to the notebook-level
#         `t_simplepos22_sep`, so existing one-argument calls are unchanged.
# Returns a tibble with id, year, month, news, keyword, simplepos22 and `tag`.
#
# all_of() marks `column` as an external character vector. Passing it bare to
# select()/rename() is ambiguous: a data column that happened to be called
# `column` would be picked instead.
c_merge <- function(column, df = t_simplepos22_sep) {
  df %>%
    select(id, year, month, news, keyword, simplepos22, tag = all_of(column))
}

# c_merge("c_02")

In [130]:
# Spot-check the helper on the second piece column.
tail(c_merge("c_02"), 3)

id,year,month,news,keyword,simplepos22,tag
10806,2018,7,조선일보,한국당,40/nn+명/nb+이/jc,명/nb
10806,2018,7,조선일보,한국당,널/pv+ㅁ/et+./sf,ㅁ/et
10806,2018,7,조선일보,한국당,./sf,


In [118]:
# Stack the nine piece columns into one long table: all c_01 rows first, then
# all c_02 rows, and so on through c_09.
t_simplepos22_sep_merge <- bind_rows(
  lapply(sprintf("c_%02d", 1:9), c_merge)
)


In [132]:
# First ten rows of the stacked table.
head(t_simplepos22_sep_merge, 10)

id,year,month,news,keyword,simplepos22,tag
9,2019,6,조선일보,경제,'/sr+경제/nc+가/jc,'/sr
9,2019,6,조선일보,경제,탄탄하다'던/nc,탄탄하다'던/nc
9,2019,6,조선일보,경제,정부/nc,정부/nc
9,2019,6,조선일보,경제,여당/nc+이/jc,여당/nc
9,2019,6,조선일보,경제,갑자기/ma,갑자기/ma
9,2019,6,조선일보,경제,'/sr+대외/nc,'/sr
9,2019,6,조선일보,경제,여건/nc,여건/nc
9,2019,6,조선일보,경제,악화/nc+'/sr+를/jc,악화/nc
9,2019,6,조선일보,경제,내세우/pv+면서/ec,내세우/pv
9,2019,6,조선일보,경제,"""/sl+추경예산/nc","""/sl"


In [138]:
# Non-missing pieces belonging to article 704.
t_simplepos22_sep_merge %>%
  filter(!is.na(tag), id == 704)

id,year,month,news,keyword,simplepos22,tag
704,2018,07,조선일보,경제,민주당/nc,민주당/nc
704,2018,07,조선일보,경제,원내대표/nc+가/jc,원내대표/nc
704,2018,07,조선일보,경제,당/nc,당/nc
704,2018,07,조선일보,경제,회의/nc+에서/jc,회의/nc
704,2018,07,조선일보,경제,심각한/nc,심각한/nc
704,2018,07,조선일보,경제,고용난/nc+에/jc,고용난/nc
704,2018,07,조선일보,경제,대하/pv+어/ec,대하/pv
704,2018,07,조선일보,경제,"""/sl+뼈아프/pa+다/ef+""/sr+면/nb+서/jc+도/jx","""/sl"
704,2018,07,조선일보,경제,원인/nc+에/jc,원인/nc
704,2018,07,조선일보,경제,대하/pv+어선/ec,대하/pv


In [141]:
# Drop the NA padding rows produced for tokens with fewer than nine pieces.
t_simplepos22_sep_merge_filter <- t_simplepos22_sep_merge %>%
  filter(!is.na(tag))

In [145]:
# Split each "morpheme/tag" piece at the first "/" into `word` and `symbol`,
# keeping the original `tag` column alongside.
t_simplepos22_sep_merge_filter_sep <- t_simplepos22_sep_merge_filter %>%
  separate(
    tag,
    into = c("word", "symbol"),
    sep = "\\/",
    remove = FALSE,
    extra = "merge",
    convert = TRUE
  )

In [147]:
# Short alias for the fully split token table used in the rest of the analysis.
t_final <- t_simplepos22_sep_merge_filter_sep

In [152]:
# Ten most frequent tag symbols.
t_final %>%
  count(symbol, sort = TRUE) %>%
  head(10)

symbol,n
nc,6345
jc,2634
et,1022
sf,1008
pv,858
nb,846
ef,843
ec,837
nn,775
jx,616


In [163]:
# Lexicon n-grams whose text contains an upper-case "N".
sentiments_sub_polarity %>%
  filter(str_detect(ngram, "N"))

ngram,freq,COMP,NEG,NEUT,POS,max.value,max.prop
가/JKC;아니/VCN,6,0,0.6666667,0,0.3333333,NEG,0.6666667
가/JKC;아니/VCN;ㄴ가/EC,1,0,1.0000000,0,0.0000000,NEG,1.0000000
가/JKC;아니/VCN;면/EC,1,0,0.0000000,0,1.0000000,POS,1.0000000
가/JKC;아니/VCN;ㅂ니다/EF,1,0,0.0000000,0,1.0000000,POS,1.0000000
가/JKS;가능/NNG,1,0,0.0000000,0,1.0000000,POS,1.0000000
가/JKS;가능/NNG;하/XSA,1,0,0.0000000,0,1.0000000,POS,1.0000000
가/JKS;그/MM;동안/NNG,1,0,0.0000000,0,1.0000000,POS,1.0000000
가/JKS;극대/NNG,1,0,1.0000000,0,0.0000000,NEG,1.0000000
가/JKS;극대/NNG;화/XSN,1,0,1.0000000,0,0.0000000,NEG,1.0000000
가/JKS;꿈/NNG,1,0,0.0000000,1,0.0000000,NEUT,1.0000000


In [196]:
# Reminder of the stop-word table layout.
head(stop_words, 3)

lexicon,word,tag,score
k_corpus,이,VCP,0.018279601
k_corpus,있,VA,0.011699048
k_corpus,하,VV,0.009773658


In [177]:
# Number of token pieces left after removing rows whose `word` is a stop word.
t_final %>%
  anti_join(stop_words, by = "word") %>%
  count()

n
12513


In [189]:
# Give the sentiment dictionary English column names (same columns and order).
sentiments_words <- sentiments_words %>%
  select(
    id = 일련번호,
    word = 단어,
    sentiment = 감정범주,
    freq = 빈도,
    M_score = 감정정도M,
    STD_score = 감정정도STD
  )

In [195]:
# Run the dictionary words through the same SimplePos22 tokeniser used for the
# articles, and preview the result.
sentiments_words %>%
  unnest_tokens(output = tag, input = word, token = SimplePos22) %>%
  head(3)

id,sentiment,freq,M_score,STD_score,tag
1,혐오,47.5,4.95,2.46,가관/nc+이/jc
1,혐오,47.5,4.95,2.46,다/ma
2,슬픔,88.8,5.62,2.36,가련/nc+하/xs+어/ec


In [193]:
# Preview the final token table.
head(t_final, 5)

id,year,month,news,keyword,simplepos22,tag,word,symbol
9,2019,6,조선일보,경제,'/sr+경제/nc+가/jc,'/sr,',sr
9,2019,6,조선일보,경제,탄탄하다'던/nc,탄탄하다'던/nc,탄탄하다'던,nc
9,2019,6,조선일보,경제,정부/nc,정부/nc,정부,nc
9,2019,6,조선일보,경제,여당/nc+이/jc,여당/nc,여당,nc
9,2019,6,조선일보,경제,갑자기/ma,갑자기/ma,갑자기,ma


In [194]:
# Preview the stop-word table.
head(stop_words, 3)

lexicon,word,tag,score
k_corpus,이,VCP,0.018279601
k_corpus,있,VA,0.011699048
k_corpus,하,VV,0.009773658


In [198]:
# Display the subjectivity-polarity lexicon.
sentiments_sub_polarity

ngram,freq,COMP,NEG,NEUT,POS,max.value,max.prop
가*/JKS,1,0,0.0000000,0.00000000,1.0000000,POS,1.0000000
가*/JKS;있/VV,1,0,0.0000000,0.00000000,1.0000000,POS,1.0000000
가*/JKS;있/VV;었/EP,1,0,0.0000000,0.00000000,1.0000000,POS,1.0000000
가*/VV,3,0,0.0000000,0.00000000,1.0000000,POS,1.0000000
가*/VV;ㄴ다*/EF,1,0,0.0000000,0.00000000,1.0000000,POS,1.0000000
가/JKC,17,0,0.4117647,0.11764706,0.4705882,POS,0.4705882
가/JKC;되/VV,11,0,0.2727273,0.18181818,0.5454545,POS,0.5454545
가/JKC;되/VV;ㄴ/ETM,2,0,0.0000000,1.00000000,0.0000000,NEUT,1.0000000
가/JKC;되/VV;ㄹ/ETM,1,0,0.0000000,0.00000000,1.0000000,POS,1.0000000
가/JKC;되/VV;어/EC,2,0,0.5000000,0.00000000,0.5000000,NEG,0.5000000


In [204]:
# Token-piece counts per newspaper, keyword and year.
count(t_final, news, keyword, year)

news,keyword,year,n
경향신문,경제,2018,1719
경향신문,경제,2019,650
경향신문,민주당,2018,511
경향신문,민주당,2019,525
경향신문,세월호,2018,261
경향신문,세월호,2019,96
경향신문,최저임금,2018,879
경향신문,최저임금,2019,85
경향신문,한국당,2018,3118
경향신문,한국당,2019,2960


In [222]:
# Final token table with stop words removed ("wos" = without stop words).
t_final_wos <- t_final %>%
  anti_join(stop_words, by = "word")

In [210]:
library(ggplot2)
library(gridExtra)
library(wordcloud)


Attaching package: ‘gridExtra’

The following object is masked from ‘package:dplyr’:

    combine

Loading required package: RColorBrewer


In [225]:
# For 조선일보 articles found under the 경제 keyword: number of distinct articles
# each word occurs in, most widespread first.
t_final_wos %>%
  filter(news == "조선일보", keyword == "경제") %>%
  group_by(word) %>%
  summarise(word_cnt = n_distinct(id)) %>%
  arrange(desc(word_cnt))


word,word_cnt
.,49
다,49
었,48
는,44
ㄴ,37
은,35
도,26
ㄹ,25
경제,25
"""",24


In [230]:
# Tokenise the article titles with extractNoun, giving one row per extracted
# noun (see the sample output of the next cell).
data_token <- data %>%
  select(id, date, year, month, day, news, keyword, title) %>%
  unnest_tokens(output = nouns, input = title, token = extractNoun)

In [231]:
# Remove stop words from the title nouns and preview the result.
#
# NOTE(review): the original call was `anti_join(S)`, but no object named `S`
# is defined anywhere in the visible notebook. This assumes the stop-word
# table was intended -- confirm. The join key is spelled out because
# `data_token` stores its tokens in `nouns` while `stop_words` uses `word`;
# the two tables share no column name, so a key-less join would fail.
data_token %>%
  anti_join(stop_words, by = c("nouns" = "word")) %>%
  head(3)

id,date,year,month,day,news,keyword,nouns
1,20190613,2019,6,13,강원일보,경제,사설
1,20190613,2019,6,13,강원일보,경제,내년
1,20190613,2019,6,13,강원일보,경제,총선
