In [None]:
import pandas as pd
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
from future.utils import iteritems
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import numpy as np
import datetime

import networkx as nx

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

# Resolve the family name of the font file below; it is passed as font_family
# when the graph labels are drawn.
# NOTE(review): hard-coded Windows font path - presumably this fails on machines
# that do not have this file; confirm before running elsewhere.
font_name = fm.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
# Disabled: would apply the font globally (`rc` is not imported in this cell).
#rc('font', family=font_name)

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook
  from pandas import Panel


In [None]:
# Year of the data set; it selects the data directory and the file names below.
year = 2019
# Base data directory. It already ends with "/", so the joins below (which start
# with "/") produce a doubled slash inside the resulting paths.
path = "./data/NaverNews{}/".format(year)
# Input CSV that is read into `df`.
load_path = path + "/NaverNewsSTW{}.csv".format(year)
# Intended output location for the TF-IDF table.
# NOTE(review): no visible cell writes to save_path - confirm whether a save step is missing.
save_path = path + "/NaverNewsTF_IDF{}.csv".format(year)

In [None]:
# Read the CSV, keep only the four columns used in this notebook, and drop
# every row that has a missing value in any of them.
wanted_columns = ["Date", "Title", "Similar_Body", "stopwords"]
df = pd.read_csv(load_path)[wanted_columns].dropna()

In [None]:
df.head(3)

Unnamed: 0,Date,Title,Similar_Body,stopwords
0,2019.10.01.,"""앗 따가워!"" 정전기 잘 통하는 사람들의 공통점",김대리가 생활 속 꿀팁을 전합니다 어머니 아버지 싱글족 직장인 등 다양한 모습의 김...,"['김대', '리가', '생활', '꿀팁', '전', '어머니', '아버지', '싱..."
1,2019.10.01.,태풍 '미탁' 영향으로 목포항구축제 하루 늦춰 4일 개막,지난해 목포항구축제 행사장에 설치된 어등터널 photonewsis 제 태풍 미탁의 ...,"['목포항구축제', '설치', '어등', '터널', '제', '태풍미탁', '영향'..."
2,2019.10.08.,또 돌아온 환절기… 영양제 꼭 먹어야 하나,김대리가 생활 속 꿀팁을 전합니다 어머니 아버지 싱글족 직장인 등 다양한 모습의 김...,"['김대', '리가', '생활', '꿀팁', '전', '어머니', '아버지', '싱..."


In [None]:
def MakeList(data):
    """Convert the text form of a token list back into a list of strings.

    The "stopwords" column holds each list as text, e.g. "['a', 'b']".
    The text is parsed as a Python literal, so single-element lists, empty
    lists and tokens that contain commas are all handled correctly (slicing
    comma-separated pieces at fixed offsets left a stray quote on a
    single-element list and split tokens at embedded commas).

    Parameters
    ----------
    data : str
        Text representation of a list of tokens.

    Returns
    -------
    list of str
        The tokens, in their original order.
    """
    import ast

    try:
        parsed = ast.literal_eval(data)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(token) for token in parsed]

    # Fallback for text that is not a valid list literal: drop the surrounding
    # brackets, split on commas and strip whitespace and quotes from each piece.
    inner = data.strip().strip("[]")
    return [piece.strip().strip("'\"") for piece in inner.split(",") if piece.strip()]

In [None]:
# Parse every row's stringified token list into a real list, with a progress bar.
# NOTE: the column key "MakeLisk" (sic) is read back under the same spelling when
# `documents` is built, so the two must stay in sync.
df["MakeLisk"] = df["stopwords"].progress_apply(MakeList)

HBox(children=(FloatProgress(value=0.0, max=4376.0), HTML(value='')))




# TF, TF-IDF 함수 형식에 맞는 데이터 만들기

In [None]:
# One token list per DataFrame row, in row order.
documents = df["MakeLisk"].tolist()

In [None]:
# Module-level corpus: one space-joined string per document.
ListDocuments = []


def WordCount(data):
    """Append one document to the shared corpus.

    The tokens in ``data`` are joined with single spaces and the resulting
    string is appended to the module-level ``ListDocuments``.

    Returns the ``ListDocuments`` list itself (not a copy).
    """
    ListDocuments.append(" ".join(data))
    return ListDocuments

In [None]:
# Start from an empty corpus so that re-running this cell does not append every
# document a second time (WordCount appends to the module-level ListDocuments).
ListDocuments.clear()
for tokens in documents:
    WordCount(tokens)

# TF - IDF

In [None]:
# Vocabulary cap for the vectorizer; the same value also limits how many of the
# strongest edges are added to the graph and drawn further down.
max_data = 1000
# max_df=0.95 drops terms that occur in more than 95% of the documents.
# min_df=1 (the library default) keeps every remaining term, which is exactly
# what min_df=0 did; recent scikit-learn releases reject an integer min_df
# below 1 during parameter validation.
vect = TfidfVectorizer(max_features=max_data, max_df=0.95, min_df=1)

In [None]:
# Learn the vocabulary and IDF weights from the corpus and return the sparse
# document-term TF-IDF matrix (one row per document, one column per term).
tfidfv  = vect.fit_transform(ListDocuments)

In [None]:
# Vocabulary terms in column order of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# use get_feature_names_out() when it exists and fall back on older versions.
if hasattr(vect, "get_feature_names_out"):
    words = vect.get_feature_names_out().tolist()
else:
    words = vect.get_feature_names()

In [None]:
# Dense (documents x terms) copy of the sparse TF-IDF matrix.
data_array = tfidfv.toarray()

In [None]:
# Tabular view of the TF-IDF weights: one row per document, one column per term.
data = pd.DataFrame(data_array, columns = words)

In [None]:
data.head(3)

Unnamed: 0,가격,가능성,가량,가방,가수,가오슝,가을,가이드,가입,가정,...,회원,회의,회장,효과,후원,휴가,휴식,흑자,희망,힐링
0,0.0,0.0,0.0,0.0,0.0,0.0,0.144323,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.091417,0.0,0.115552,0.0,0.0,0.0,...,0.0,0.084186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.086714,0.0,0.0,0.069685,...,0.0,0.0,0.0,0.115988,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Column-wise sum: the total TF-IDF weight of each term over all documents
# (despite the name, these are summed weights rather than raw occurrence counts).
count_mat = tfidfv.sum(axis=0)
count_mat.shape

(1, 1000)

In [None]:
# Convert the summed weights to an ndarray and drop the length-1 axis.
count = np.array(count_mat).squeeze()
count.shape

(1000,)

In [None]:
# Pair every term with its summed TF-IDF weight.
# NOTE(review): word_count is not read by any later visible cell - confirm it is still needed.
word_count = list(zip(words, count))

In [None]:
# Correlation coefficients between term columns; rowvar=False makes numpy treat
# each column (term) as a variable and each row (document) as an observation.
word_corr = np.corrcoef(tfidfv.toarray(), rowvar=False)

In [None]:
# One (term_a, term_b, correlation) triple for every unordered pair of distinct
# terms, i.e. the strict upper triangle of word_corr in row-major order.
n_words = len(words)
edges = [
    (words[a], words[b], word_corr[a, b])
    for a in range(n_words)
    for b in range(a + 1, n_words)
]

In [None]:
# Order the pairs by correlation, strongest first, then preview the top five.
edges.sort(key=lambda edge: edge[2], reverse=True)
edges[:5]

[('꿀잼', '우스', 0.8366516537301812),
 ('홍역', '환자', 0.6933104186088495),
 ('가입', '보험', 0.6756690154816428),
 ('창작', '키워드', 0.6729596603976994),
 ('민속', '어촌', 0.6699668409154019)]

In [None]:
# Endpoint pairs only (weight dropped), in the same order as `edges`.
edge_list = [edge[:2] for edge in edges]
edge_list[:5]

[('꿀잼', '우스'), ('홍역', '환자'), ('가입', '보험'), ('창작', '키워드'), ('민속', '어촌')]

In [None]:
# Line widths for drawing: each correlation multiplied by 7, in the same order
# as `edges` / `edge_list`.
weight_list = [edge[2] * 7 for edge in edges]
weight_list[:5]

[5.856561576111269,
 4.853172930261946,
 4.7296831083715,
 4.710717622783895,
 4.689767886407814]

# 시각화

In [None]:
# Build the graph from the strongest correlations right away. The layout, degree
# and node-size cells that follow all read G, so on a top-to-bottom run it must
# already contain its nodes and edges; an empty graph yields an empty layout and
# the drawing calls then fail because no node has a position.
G = nx.Graph()
G.add_weighted_edges_from(edges[:max_data + 1])

In [None]:
G

<networkx.classes.graph.Graph at 0x14360508e50>

In [None]:
# Force-directed layout. spring_layout starts from random positions, so a fixed
# seed is passed to make the figure reproducible from one run to the next.
position = nx.spring_layout(G, k=0.99, iterations=1000, seed=42)

In [None]:
position

In [None]:
# Keys of the layout dict, i.e. the nodes that were given a position.
# This is a live view of `position`, not a copy.
node = position.keys()
node

In [None]:
# Map every node currently in G to its degree.
test = dict(G.degree())

In [None]:
test.keys()

In [None]:
test.values()

In [None]:
# Module-level accumulator: word -> sum of the weights of edges it starts.
node_size = {}
def NodeSize(edges, node):
    """Accumulate edge weights per source word into the module-level ``node_size``.

    For every ``(source, target, weight)`` triple in ``edges`` whose ``source``
    is contained in ``node``, ``weight`` is added to ``node_size[source]``.
    Triples whose source is not in ``node`` are ignored.

    NOTE(review): only the first endpoint of each edge is credited; the second
    endpoint never receives the weight - confirm that this is intended.

    Returns ``None``; the result is left in ``node_size``.
    """
    for source, _target, weight in edges:
        if source not in node:
            continue
        if source in node_size:
            node_size[source] = node_size[source] + weight
        else:
            node_size[source] = weight
                       

In [None]:
# NodeSize adds into the module-level node_size dict, so empty it first;
# otherwise re-running this cell would double every accumulated weight.
node_size.clear()
NodeSize(edges, node)

In [None]:
node_size.values()

In [None]:
# Add the strongest max_data + 1 correlations to the graph as weighted edges and
# record their endpoint pairs.
top_edges = edges[:max_data + 1]
G.add_weighted_edges_from(top_edges)
edge_set = {(word1, word2) for word1, word2, _ in top_edges}

In [None]:
plt.figure(figsize = (100, 75))
# draw_networkx_nodes needs node_size to be a scalar or a sequence aligned with
# the nodes being drawn; a node -> degree dict is neither. Build the sequence
# straight from G (G.degree() iterates in node order, the same order the drawing
# call uses) and scale it so that one incident edge corresponds to an area of 100.
NODE_AREA_PER_DEGREE = 100
degree_sizes = [NODE_AREA_PER_DEGREE * degree for _, degree in G.degree()]
nx.draw_networkx_nodes(G, position, node_size=degree_sizes, node_color = "red")
# Draw only the strongest edges, with line width proportional to the correlation.
nx.draw_networkx_edges(G, position, edgelist = edge_list[:max_data+1], width = weight_list[:max_data+1], edge_color = 'blue')
nx.draw_networkx_labels(G, position, font_size = 70, font_family=font_name)
plt.axis('off')
# The output file name embeds the data year and the current date and time.
now = datetime.datetime.now()
save_png = path + "/graph/NaverNewsTDMGraph{0}_{1}{2}{3}_{4}{5}{6}.png".format(year, now.year, now.month, now.day, now.hour, now.minute, now.second)
plt.savefig(save_png)
plt.show()

In [None]:
# Second rendering: a much larger canvas, uniform node size and smaller labels.
top_n = max_data + 1
plt.figure(figsize=(400, 300))
nx.draw_networkx_nodes(G, position, node_size=100, node_color="red")
nx.draw_networkx_edges(
    G,
    position,
    edgelist=edge_list[:top_n],
    width=weight_list[:top_n],
    edge_color='blue',
)
nx.draw_networkx_labels(G, position, font_size=20, font_family=font_name)
plt.axis('off')

# The output file name embeds the data year and the current date and time.
now = datetime.datetime.now()
stamp_fields = (year, now.year, now.month, now.day, now.hour, now.minute, now.second)
save_png = path + "/graph/NaverNewsTDMGraph{0}_{1}{2}{3}_{4}{5}{6}.png".format(*stamp_fields)
plt.savefig(save_png)
plt.show()