In [63]:
import pandas as pd
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 데이터 로드
data = pd.read_csv(r'C:\Users\user\boyu571_github\01_Kakaobank_SKKU_Research_23\eng_20.csv')

# 형태소 분석기 초기화
okt = Okt()

# 기사의 내용을 형태소로 분석
data['tokenized_content'] = data['content'].apply(lambda x: ' '.join(okt.morphs(x)))

# TF-IDF 변환기를 초기화
tfidf_vectorizer = TfidfVectorizer()

# 형태소 분석된 내용을 벡터로 변환
tfidf_matrix = tfidf_vectorizer.fit_transform(data['tokenized_content'])

# 코사인 유사도 행렬 계산
cosine_sim = cosine_similarity(tfidf_matrix)

# 행렬의 통계 값 계산
# Compute statistics for the matrix
min_value = round(np.min(cosine_sim), 4)
max_value = round(np.max(cosine_sim), 4)
mean_value = round(np.mean(cosine_sim), 4)
median_value = round(np.median(cosine_sim), 4)
std_value = round(np.std(cosine_sim), 4)

min_value, max_value, mean_value, median_value, std_value

(0.0127, 1.0, 0.2894, 0.1647, 0.3401)

In [64]:
len(data)

10

In [None]:
# 군집화
clusters = []
visited = np.zeros(cosine_sim.shape[0], dtype=bool)
for i in range(cosine_sim.shape[0]):
    if not visited[i]:
        similar_articles = np.where(cosine_sim[i] > 0.6)[0]
        clusters.append(similar_articles)
        visited[similar_articles] = True

# 각 군집에서 대표 기사만 남기기
representative_indices = [cluster[0] for cluster in clusters if len(cluster) > 0]

# 대표 기사만 포함된 데이터프레임 생성
filtered_data = data.iloc[representative_indices].reset_index(drop=True)
print(len(filtered_data))
filtered_data

6


Unnamed: 0,time,title,content,press,link,UTC-time,UTC-date,tokenized_content
0,2017-11-04 09:56:45,잠잠한 북한…트럼프 순방기간 도발 가능성은,\r\n\t\t\t잠잠한 북한…트럼프 순방기간 도발 가능성은[앵커] 북한은 최근 5...,연합뉴스TV언론사 선정,https://n.news.naver.com/mnews/article/422/000...,2017-11-04 00:56:45,2017-11-04,\r\n\t\t\t 잠잠한 북한 … 트럼프 순방 기간 도발 가능성 은 [ 앵커 ] ...
1,2017-11-04 09:01:08,CME 비트코인 거래 개시 소식에 계좌 개설 열풍,\n\n\n\n\n[아시아경제 황준호 기자] 세계 최대 파생상품 거래소인 미국 시...,아시아경제,https://n.news.naver.com/mnews/article/277/000...,2017-11-04 00:01:08,2017-11-04,\n\n\n\n\n [ 아시아 경제 황준호 기자 ] 세계 최대 파생상품 거래소 인 ...
2,2017-11-04 15:37:27,"스파크랩, 1천억 규모 블록체인 투자 펀드 출범",\n\t\t\t 스파크랩 그룹은 블록체인과 암호화 화폐 기업 투자 펀드인 ‘스파크체...,파이낸셜뉴스,https://n.news.naver.com/mnews/article/014/000...,2017-11-04 06:37:27,2017-11-04,\n\t\t\t 스파크 랩 그룹 은 블록 체인 과 암호 화 화폐 기업 투자 펀드 ...
3,2017-11-04 19:55:00,"""현직 경찰관에 배우 동원까지""…가상화폐 범죄 기승",\r\n\t\t\t【 앵커멘트 】 하루 거래액이 2조 원을 넘어서면서 비트코인 등...,MBN,https://n.news.naver.com/mnews/article/057/000...,2017-11-04 10:55:00,2017-11-04,\r\n\t\t\t 【 앵커 멘트 】 하루 거래 액 이 2조 원 을 넘어서면서 비트...
4,2017-11-05 01:34:20,"가상화폐, 금융시장의 새로운 쓰나미",\r\n비트코인·이더리움 등 1000종류상위 10개 시가총액만 150조원빠르고 거래...,T19면 TOP,https://n.news.naver.com/mnews/article/353/000...,2017-11-04 16:34:20,2017-11-04,\r\n 비트코인 · 이 더 리움 등 1000 종류 상위 10 개 시가총액 만 15...
5,2017-11-05 06:00:00,[박수찬의 軍]대통령 공약과 따로 노는 국방부의 北 미사일 대책,\r\n\r\n\r\n\r\n\r\n천궁 지대공미사일이 2일 충남 보령 대천사격장에...,세계일보언론사 선정,https://n.news.naver.com/mnews/article/022/000...,2017-11-04 21:00:00,2017-11-04,\r\n\r\n\r\n\r\n\r\n 천궁 지 대 공 미사일 이 2일 충남 보령 대...


In [67]:
filtered_data.to_csv('filtered_20171104_01.csv', encoding='utf-8-sig', index=False)

In [7]:
data = pd.read_csv(r'C:\Users\user\boyu571_github\01_Kakaobank_SKKU_Research_23\eng_20.csv', lineterminator='\n')
data

Unnamed: 0,source,author,title,description,url,urlToImage,time,content,press,content_full,date\r
0,"{'id': None, 'name': ""Barron's""}",Ed Lin,Hunting for Undervalued REITs,Stiller Beobachter | Flickr Stiller Beobachter...,https://www.barrons.com/articles/hunting-for-u...,https://asset.barrons.com/public/resources/ima...,2018-09-15 00:45:26+00:00,Shares of real estate investment trusts have s...,Barron's,Shares of real estate investment trusts have s...,2018-09-15\r
1,"{'id': None, 'name': 'Bitrebels.com'}",Brenda Vollman,5 Great Reasons To Buy And Hold Cryptocurrenci...,How many times have you heard someone say that...,http://www.bitrebels.com/business/5-reasons-bu...,http://www.bitrebels.com/wp-content/uploads/20...,2018-09-15 01:00:21+00:00,How many times have you heard someone say that...,Bitrebels.com,How many times have you heard someone say that...,2018-09-15\r
2,"{'id': None, 'name': 'Independent.ie'}",Sean O'Grady,Westlife star warns fans off Web scam,Singer Shane Filan has warned his fans about a...,https://www.independent.ie/irish-news/westlife...,https://www.independent.ie/incoming/article373...,2018-09-15 01:30:00+00:00,Singer Shane Filan has warned his fans about a...,Independent.ie,Singer Shane Filan has warned his fans about a...,2018-09-15\r
3,"{'id': None, 'name': 'New Zealand Herald'}",newsfeeds@nzherald.co.nz,Comment: Euro pipe dream holds no currency,"COMMENT: Over the next ten years, it will beco...",https://www.nzherald.co.nz/business/news/artic...,https://www.nzherald.co.nz/resizer/MrJmsEMrF2l...,2018-09-15 01:34:39+00:00,"COMMENT: Over the next ten years, it will beco...",New Zealand Herald,"COMMENT: Over the next ten years, it will beco...",2018-09-15\r
4,"{'id': None, 'name': 'Sputnik International'}",Sputnik,OAS Chief Not Ruling Out Military Invasion of ...,The head of the Organization of American State...,https://sputniknews.com/latam/2018091510680529...,https://cdn5.img.sputniknews.com/images/106755...,2018-09-15 01:56:00+00:00,The head of the Organization of American State...,Sputnik International,"""With regards to a military intervention aimed...",2018-09-15\r
5,"{'id': None, 'name': 'newsBTC'}",Dalmas Ngetich,Cardano Price Analysis: Cardano Buyers Accumulate,There are low time frame recoveries but it loo...,https://www.newsbtc.com/2018/09/15/cardano-pri...,https://s3.amazonaws.com/main-newsbtc-images/2...,2018-09-15 02:00:07+00:00,There are low time frame recoveries but it loo...,newsBTC,There are low time frame recoveries but it loo...,2018-09-15\r
6,"{'id': None, 'name': 'Yahoo Entertainment'}",,"Bitcoin Cash, Litecoin and Ripple Daily Analys...",It’s a positive start to the weekend for the m...,https://finance.yahoo.com/news/bitcoin-cash-li...,https://s.yimg.com/uu/api/res/1.2/qjRDFY7ZdfVP...,2018-09-15 03:58:52+00:00,Bitcoin Cash Resumes Recovery Bitcoin Cash fel...,Yahoo Entertainment,Bitcoin Cash Resumes RecoveryBitcoin Cash fell...,2018-09-15\r
7,"{'id': 'the-irish-times', 'name': 'The Irish T...",Aifric Campbell,"Crash, Crisis, Crypto: a short history of inve...",Short story by Aifric Campbell marking the 10t...,https://www.irishtimes.com/culture/books/crash...,https://www.irishtimes.com/image-creator/?id=1...,2018-09-15 05:00:00+00:00,I walk into the den where my two girls are lyi...,The Irish Times,I walk into the den where my two girls are lyi...,2018-09-15\r
8,"{'id': None, 'name': 'newsBTC'}",Rick D.,"Bitcoin Hit $20k in 2017 Without Institutions,...",Despite an overwhelming sense of bearishness i...,https://www.newsbtc.com/2018/09/15/bitcoin-hit...,https://s3.amazonaws.com/main-newsbtc-images/2...,2018-09-15 08:30:54+00:00,Despite an overwhelming sense of bearishness i...,newsBTC,Despite an overwhelming sense of bearishness i...,2018-09-15\r
9,"{'id': None, 'name': 'CoinDesk'}",Alyssa Hertig,Grin Is Hard Forking Every Six Months to Keep ...,"Amid crypto's war on ASICs, the Grin community...",https://www.coindesk.com/grin-is-hard-forking-...,https://media.coindesk.com/uploads/2018/09/raz...,2018-09-15 09:15:21+00:00,The Grin community is trying to keep ASICs awa...,CoinDesk,The Grin community is trying to keep ASICs awa...,2018-09-15\r


In [42]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import numpy as np
from nltk.tokenize import sent_tokenize
import torch

# 필요한 경우 NLTK 데이터를 다운로드합니다.
import nltk

# 데이터 로드
data = pd.read_csv(r'C:\Users\user\boyu571_github\01_Kakaobank_SKKU_Research_23\eng_250.csv', lineterminator='\n')

# sBERT 모델 로드
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# 문서를 문장으로 나누고 각 문장에 대해 임베딩을 생성합니다.
document_embeddings = []
for content in data['content_full']:
    sentences = sent_tokenize(content)  # 문장으로 분할
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)  # 문장 임베딩 생성
    document_embedding = torch.mean(sentence_embeddings, dim=0)  # 문장 임베딩을 합산하여 문서 임베딩 생성
    document_embeddings.append(document_embedding)

# 문서 임베딩으로 유사도 행렬을 계산합니다.
similarity_matrix = []
for embedding1 in document_embeddings:
    row = []
    for embedding2 in document_embeddings:
        similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
        row.append(similarity)
    similarity_matrix.append(row)

# 행렬의 통계 값 계산
min_value = round(np.min(similarity_matrix), 4)
max_value = round(np.max(similarity_matrix), 4)
mean_value = round(np.mean(similarity_matrix), 4)
median_value = round(np.median(similarity_matrix), 4)
std_value = round(np.std(similarity_matrix), 4)

min_value, max_value, mean_value, median_value, std_value


(0.0098, 1.0, 0.4499, 0.4469, 0.1273)

In [None]:
from nltk.tokenize import sent_tokenize

# Assuming 'data' contains full articles, not individual sentences
sentences = [sent_tokenize(article) for article in data['content_full'].tolist()]

# Flatten the list of lists into a single list of sentences
sentences = [sentence for sublist in sentences for sentence in sublist]

# Now encode these sentences
embeddings = model.encode(sentences, convert_to_tensor=True)

In [43]:
# 군집화
clusters = []
visited = np.zeros(len(data), dtype=bool)
for i in range(len(data)):
    if not visited[i]:
        similar_articles = np.where(np.array(similarity_matrix[i]) > 0.6)[0]
        clusters.append(similar_articles)
        visited[similar_articles] = True

# 각 군집에서 대표 기사만 남기기
representative_indices = [cluster[0] for cluster in clusters if len(cluster) > 0]

# 대표 기사만 포함된 데이터프레임 생성
filtered_data = data.iloc[representative_indices].reset_index(drop=True)
print(len(filtered_data))
filtered_data

79


Unnamed: 0,source,author,title,description,url,urlToImage,time,content,press,content_full,date\r
0,"{'id': None, 'name': 'OilPrice.com'}",Simon Watkins,Could Energy Ties Bring Saudi Arabia's Conflic...,The relationship between the two great indigen...,https://oilprice.com/Geopolitics/Middle-East/C...,https://d32r1sh890xpii.cloudfront.net/article/...,2021-06-15 00:00:00+00:00,Saudi Arabia is sidelining itself…\r\nIran is ...,OilPrice.com,The relationship between the two great indigen...,2021-06-15\r
1,"{'id': None, 'name': 'Bitcoinist'}",Samuel Wan,The World Economic Forum’s Cryptocurrency Guid...,The World Economic Forum (WEF) has published i...,https://bitcoinist.com/the-world-economic-foru...,https://bitcoinist.com/wp-content/uploads/2021...,2021-06-15 00:00:57+00:00,The World Economic Forum (WEF) has published i...,Bitcoinist,The World Economic Forum (WEF) has published i...,2021-06-15\r
2,"{'id': None, 'name': 'Seeking Alpha'}",SA Transcripts,The Walt Disney Company's (DIS) CEO Bob Chapek...,The Walt Disney Company (NYSE:DIS) Credit Suis...,https://seekingalpha.com/article/4434827-walt-...,https://static.seekingalpha.com/assets/og_imag...,2021-06-15 00:01:09+00:00,The Walt Disney Company (NYSE:DIS) Credit Suis...,Seeking Alpha,The Walt Disney Company (NYSE:DIS) Credit Suis...,2021-06-15\r
3,"{'id': None, 'name': 'Activistpost.com'}",Activist Post,Millions of Sleep Apnea Machines and Ventilato...,By B.N. Frank Sleep apnea is common in the U.S...,https://www.activistpost.com/2021/06/millions-...,https://www.activistpost.com/wp-content/upload...,2021-06-15 00:37:15+00:00,By B.N. Frank\r\nSleep apnea is common in the ...,Activistpost.com,By B.N. Frank Sleep apnea is common in the U.S...,2021-06-15\r
4,"{'id': None, 'name': 'Wccftech'}",Evan Federowicz,ASUS Launches Its RTX 30 Series GPUs with Lite...,ASUS has added new models to its RTX 30 series...,https://wccftech.com/asus-launches-its-rtx-30-...,https://cdn.wccftech.com/wp-content/uploads/20...,2021-06-15 00:39:11+00:00,ASUS has added new models to its RTX 30 series...,Wccftech,ASUS has added new models to its RTX 30 series...,2021-06-15\r
...,...,...,...,...,...,...,...,...,...,...,...
74,"{'id': 'newsweek', 'name': 'Newsweek'}",Paul Bond,Ivory Hecker Says Fox Station Censored Bitcoin...,"Ivory Hecker, a reporter for Fox at a Houston ...",https://www.newsweek.com/ivory-hecker-says-fox...,https://d.newsweek.com/en/full/1822110/james-o...,2021-06-15 22:01:48+00:00,Houston TV reporter Ivory Hecker on Tuesday re...,Newsweek,Houston TV reporter Ivory Hecker released Tues...,2021-06-15\r
75,"{'id': None, 'name': 'CNBC'}",Tanaya Macheel,Crypto asset manager Bitwise raises $70 millio...,Crypto index fund manager Bitwise raised $70 m...,https://www.cnbc.com/2021/06/15/crypto-asset-m...,https://image.cnbcfm.com/api/v1/image/10689088...,2021-06-15 12:41:03+00:00,"Henry Kravis, Stanley Druckenmiller and Bridge...",CNBC,"Henry Kravis, Stanley Druckenmiller and Bridge...",2021-06-15\r
76,"{'id': None, 'name': 'Bitcoinist'}",Bitcoinist,Bitwells Boosts Your ROI in High Volatility,"Bitcoin dropped nearly 35% in May, marking one...",https://bitcoinist.com/bitwells-boosts-your-ro...,https://bitcoinist.com/wp-content/uploads/2021...,2021-06-15 17:14:15+00:00,"Bitcoin dropped nearly 35% in May, marking one...",Bitcoinist,"Bitcoin dropped nearly 35% in May, marking one...",2021-06-15\r
77,"{'id': None, 'name': 'Cointelegraph'}",Cointelegraph By Yashu Gola,"This unknown cryptocurrency soared by 164,842%...",One random cryptocurrency's market valuation j...,https://cointelegraph.com/news/this-unknown-cr...,https://images.cointelegraph.com/images/1200_a...,2021-06-15 07:30:00+00:00,About $7.65 billion entered the cryptocurrency...,Cointelegraph,About $7.65 billion entered the cryptocurrency...,2021-06-15\r


In [70]:
filtered_data.to_csv('filtered_20171104_02.csv', encoding='utf-8-sig', index=False)

In [13]:

# Restricted Damerau-Levenshtein distance 계산 함수
def restricted_damerau_levenshtein(s1: str, s2: str) -> int:
    len_str1 = len(s1)
    len_str2 = len(s2)
    d_matrix = [[0 for _ in range(len_str2 + 1)] for _ in range(len_str1 + 1)]

    for i in range(len_str1 + 1):
        d_matrix[i][0] = i
    for j in range(len_str2 + 1):
        d_matrix[0][j] = j

    for i in range(1, len_str1 + 1):
        for j in range(1, len_str2 + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            d_matrix[i][j] = min(
                d_matrix[i - 1][j] + 1,  # Deletion
                d_matrix[i][j - 1] + 1,  # Insertion
                d_matrix[i - 1][j - 1] + cost  # Substitution
            )
            if i > 1 and j > 1 and s1[i - 1] == s2[j - 2] and s1[i - 2] == s2[j - 1]:
                d_matrix[i][j] = min(
                    d_matrix[i][j],
                    d_matrix[i - 2][j - 2] + cost  # Transposition
                )
    return d_matrix[len_str1][len_str2]

# Restricted Damerau-Levenshtein distance 기반 유사도 계산 함수
def similarity_rdl(s1: str, s2: str) -> float:
    distance = restricted_damerau_levenshtein(s1, s2)
    max_len = max(len(s1), len(s2))
    similarity = 1 - (distance / max_len)
    return similarity

In [17]:
import numpy as np
import pandas as pd

# 임시 데이터 생성 (예제 목적)
data = pd.read_csv(r'C:\Users\user\boyu571_github\01_Kakaobank_SKKU_Research_23\eng_20.csv', lineterminator='\n')

# Restricted Damerau-Levenshtein distance 기반 유사도 계산 함수
def similarity_rdl(s1: str, s2: str) -> float:
    distance = restricted_damerau_levenshtein(s1, s2)
    # 두 문자열 중 더 긴 쪽의 길이로 정규화
    max_len = max(len(s1), len(s2))
    # 유사도는 1에서 정규화된 거리를 뺀 값으로 정의 (유사도가 높을수록 거리는 낮음)
    similarity = 1 - (distance / max_len)
    return similarity

# 유사도 행렬 초기화
similarity_matrix = np.zeros((len(data), len(data)))

# 모든 기사 쌍에 대해 유사도 계산
for i in range(len(data)):
    for j in range(len(data)):
        similarity_matrix[i][j] = similarity_rdl(data['content_full'][i], data['content_full'][j])

# 유사도가 0.6 이상인 기사들을 군집화
threshold = 0.6
clusters = []
visited = np.zeros(len(data), dtype=bool)
for i in range(len(data)):
    if not visited[i]:
        # i번째 기사와 유사도가 threshold 이상인 기사들의 인덱스를 찾음
        similar_articles_indices = np.where(similarity_matrix[i] >= threshold)[0]
        clusters.append(similar_articles_indices)
        visited[similar_articles_indices] = True

# 각 군집에서 대표 기사만 남기기
representative_indices = [cluster[0] for cluster in clusters if len(cluster) > 0]

# 대표 기사만 포함된 데이터프레임 생성
filtered_data = data.iloc[representative_indices].reset_index(drop=True)

# 결과 출력
len(filtered_data), filtered_data, clusters


KeyboardInterrupt: 