# 라이브러리

In [51]:
import pandas as pd
import re
import nltk
import itertools
import networkx as nx
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from collections import Counter

nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/master/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/master/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/master/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/master/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/master/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/master/nltk_data...
[nltk_data]    |   Package movie_reviews is already

True

# 데이터 로드

In [52]:
# data load
raw = pd.read_excel('/Users/master/dev/PythonPr/news-word_cloud/crawl_result_v.1.xlsx')
raw = raw[['title', 'body', 'url']]

## 데이터 전처리

In [53]:
# text cleaning
raw['title'] = list(map(lambda x: re.sub('[^.,?!\s\w]','',x),raw['title']))
raw['body'] = list(map(lambda x: re.sub('[^.,?!\s\w]','',str(x)),raw['body']))

# case conversion
raw['title'] = list(map(lambda x: x.lower(), raw['title']))
raw['body'] = list(map(lambda x: x.lower(), raw['body']))

In [54]:
filtered_content = raw.copy()

# 토큰화

In [55]:
# Tokenization
full_body = ''.join(filtered_content['body'])
full_title = ''.join(filtered_content['title'])
full_text = ''.join([full_title, full_body])

word_tokens = nltk.word_tokenize(full_text)

# POS tagging
tokens_pos = nltk.pos_tag(word_tokens)

# select nouns
NN_words = []
for word, pos in tokens_pos:
    if 'NN' in pos:
        NN_words.append(word)
'''
# Lemmatization
wlem = nltk.WordNetLemmatizer()
lemmatized_words = []

for word in NN_words:
    new_word = wlem.lemmatize(word)
    lemmatized_words.append(new_word)
'''

'\n# Lemmatization\nwlem = nltk.WordNetLemmatizer()\nlemmatized_words = []\n\nfor word in NN_words:\n    new_word = wlem.lemmatize(word)\n    lemmatized_words.append(new_word)\n'

## Stop words 정리

In [56]:
# Stopwords removal
# 1차적으로 nltk에서 제공하는 불용어사전을 이용해서 불용어를 제거

stopwords_list = stopwords.words('english') #nltk에서 제공하는 불용어사전 이용
unique_NN_words = set(NN_words)   #set을 사용해 중복 제거
final_NN_words = NN_words

for word in unique_NN_words:
    if word in stopwords_list:
        while word in final_NN_words: final_NN_words.remove(word)


# 아래와 같이 추가로 직접 만든 불용어사전을 이용해 불용어 제거
customized_stopwords = ['be', 'today', 'yesterday', 'new', 'york', 'time']  # 직접 만든 불용어 사전
unique_NN_words1 = set(final_NN_words)

for word in unique_NN_words1:
    if word in customized_stopwords:
        while word in final_NN_words: final_NN_words.remove(word)


## final_NN_words 출력해보기
print(final_NN_words)



# Semantic Network 형성

### 최빈 단어 리스트 20개

In [57]:
c = Counter(final_NN_words)
list_of_words = []
for word, count in c.most_common(20):
    list_of_words.append(word)

print(f"가장 흔한 단어 20개: {list_of_words}")

가장 흔한 단어 20개: ['food', 'rice', 'kimchi', 'restaurant', 'dishes', 'restaurants', 'korea', 'dish', 'years', 'menu', 'kim', 'people', 'market', 'chicken', 'year', 'cuisine', 'ingredients', 'life', 'sauce', 'products']


### 문장 단위로 분리

In [58]:
# 원본 text 문장 단위로 쪼개기
sentences = full_text.split('\n')

def sentence_divide(target_symbol):
    result = []
    for sentence in sentences:
        result.extend(sentence.split(target_symbol))
    return result
sentences = sentence_divide('. ')
sentences = sentence_divide('!')
article_sentences = sentence_divide('?')

### 관계를 DataFrame으로 정리

In [59]:
'''
for sentence in article_sentences:
    word_tokens = nltk.word_tokenize(sentence)
    tokens_pos = nltk.pos_tag(word_tokens)

    NN_words = []
    for word, pos in tokens_pos:
        if "NN" in pos:
            NN_words.append(word)

    selected_words = []
    for word in NN_words:
        if word in list_of_words:
            selected_words.append(word)
    
    selected_words = set(selected_words)

    for pair in list(itertools.combinations(list(selected_words), 2)):
        # itertools.combinations: selected_words 리스트에서 2개씩 골라 조합을 만들어준다.
'''

'\nfor sentence in article_sentences:\n    word_tokens = nltk.word_tokenize(sentence)\n    tokens_pos = nltk.pos_tag(word_tokens)\n\n    NN_words = []\n    for word, pos in tokens_pos:\n        if "NN" in pos:\n            NN_words.append(word)\n\n    selected_words = []\n    for word in NN_words:\n        if word in list_of_words:\n            selected_words.append(word)\n    \n    selected_words = set(selected_words)\n\n    for pair in list(itertools.combinations(list(selected_words), 2)):\n        # itertools.combinations: selected_words 리스트에서 2개씩 골라 조합을 만들어준다.\n'

In [60]:
sentence = article_sentences[0]


word_tokens = nltk.word_tokenize(sentence)
tokens_pos = nltk.pos_tag(word_tokens)

NN_words = []
for word, pos in tokens_pos:
    if "NN" in pos:
        NN_words.append(word)

selected_words = []
for word in NN_words:
    if word in list_of_words:
        selected_words.append(word)

selected_words = set(selected_words)

word_combo = list(itertools.combinations(list(selected_words), 2))
type(word_combo)

list

In [61]:
article_sentences[0]



'501st military intelligence brigade strengthens alliance by hosting roku.s'

# 수정

In [2]:
import pandas as pd
import re
import nltk
import itertools
import networkx as nx
import matplotlib.pyplot as plt

nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/master/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/master/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/master/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/master/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/master/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/master/nltk_data...
[nltk_data]    |   Package movie_reviews is already

True

## 데이터 로드 및 소문자 변환

In [4]:
# data load
raw = pd.read_excel('/Users/master/dev/PythonPr/news-word_cloud/crawl_result_v.1.xlsx')
raw = raw[['title', 'body', 'url']]

# text cleaning
# raw['title'] = list(map(lambda x: re.sub('[^.,?!\s\w]','',x),raw['title']))
# raw['body'] = list(map(lambda x: re.sub('[^.,?!\s\w]','',str(x)),raw['body']))

# case conversion
raw['title'] = list(map(lambda x: x.lower(), raw['title']))
raw['body'] = list(map(lambda x:str(x).lower(), raw['body']))

filtered_content = raw.copy()

## 유니크 명사 뽑기

In [None]:
# Tokenization
full_body = ''.join(filtered_content['body'])
full_title = ''.join(filtered_content['title'])
full_text = ''.join([full_title, full_body])

word_tokens = nltk.word_tokenize(full_text)

# POS tagging
tokens_pos = nltk.pos_tag(word_tokens)

# select nouns
NN_words = []
for word, pos in tokens_pos:
    if 'NN' in pos:
        NN_words.append(word)
unique_NN_words = list(set(NN_words))


### Stop words 정리

In [None]:
stopwords_list = stopwords.words('english') #nltk에서 제공하는 불용어사전 이용
