<a href="https://colab.research.google.com/github/ascentadmin/lda/blob/main/LDA_1URL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install konlpy
!pip install gensim
!pip install beautifulsoup4
!pip install requests

# Install SudachiPy and its core dictionary (used for Japanese morphological analysis)
!pip install sudachipy
!pip install sudachidict_core


Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0
Collecting sudachipy
  Downloading SudachiPy-0.6.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sudachipy
Successfully installed sudachipy-0.6.8
Collecting sudachidict_core
  Downloading SudachiDict_core-20240109-py3-none-any.whl (71.8 MB)
[2K     [90m━━━━━━━━━━━━━

In [None]:
import requests
from bs4 import BeautifulSoup
from konlpy.tag import Okt
from gensim import corpora, models
from sudachipy import tokenizer
from sudachipy import dictionary
from collections import defaultdict

# Stopword list loader
def load_stopwords(lang_code):
    """Load the stopword list for a language from its text file.

    The file is read as one stopword per line.

    Args:
        lang_code: 'kr' for Korean or 'jp' for Japanese.

    Returns:
        A list of stopword strings in file order, with surrounding
        whitespace removed and blank lines dropped.

    Raises:
        ValueError: If ``lang_code`` is neither 'kr' nor 'jp'.
        OSError: If the stopword file cannot be opened.
    """
    if lang_code == 'kr':
        filepath = '/content/kr-stopword.txt'
    elif lang_code == 'jp':
        filepath = '/content/jp-stopword.txt'
    else:
        raise ValueError("Unsupported language code")

    # 'utf-8-sig' decodes plain UTF-8 as usual but also drops a leading BOM,
    # which would otherwise be glued to the first stopword and stop it matching.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        lines = file.read().splitlines()

    # Strip stray whitespace (str.strip also removes the full-width space
    # U+3000) and skip empty lines so every entry can match a token exactly.
    stopwords = []
    for line in lines:
        word = line.strip()
        if word:
            stopwords.append(word)
    return stopwords

# Web page text extraction
def get_text_from_url(url, timeout=10):
    """Download a web page and return the text content of its HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` can block indefinitely.

    Returns:
        The text extracted by BeautifulSoup's ``get_text``, with each text
        fragment stripped and the fragments joined by single spaces.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status.
        requests.exceptions.RequestException: For other request failures,
            including timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of analysing an error page
    # as if it were the requested article.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text(separator=' ', strip=True)

# Morphological analysis and stopword filtering
def analyze_text(lang, text, stopwords):
    """Tokenize ``text`` and drop stopwords and single-character tokens.

    Args:
        lang: 'kr' extracts nouns with KoNLPy's Okt; 'jp' takes the surface
            form of every SudachiPy morpheme using split mode C.
        text: The raw text to analyse.
        stopwords: Iterable of tokens to exclude from the result.

    Returns:
        A list of tokens, in their original order, that are not stopwords
        and are at least two characters long.

    Raises:
        ValueError: If ``lang`` is neither 'kr' nor 'jp'.
    """
    if lang == 'kr':
        tokens = Okt().nouns(text)
    elif lang == 'jp':
        sudachi = dictionary.Dictionary().create()
        mode = tokenizer.Tokenizer.SplitMode.C
        tokens = [m.surface() for m in sudachi.tokenize(text, mode)]
    else:
        # Without this branch an unknown language fell through and failed
        # later with an UnboundLocalError on `tokens`.
        raise ValueError("Unsupported language code")

    # A set gives constant-time membership checks for every token.
    stopword_set = frozenset(stopwords)
    # Keep only tokens longer than one character that are not stopwords.
    return [word for word in tokens if len(word) > 1 and word not in stopword_set]

# Build an LDA model and print the aggregated per-word scores
def create_lda_model(processed_docs, num_topics=15, num_words=25, random_state=None):
    """Train an LDA model and print each word's score summed over topics.

    For every topic the ``num_words`` highest-weighted words are collected;
    the weights of a word that appears in several topics are added together.
    The words are then printed in descending order of that total, one per
    line, formatted as ``word: score`` with three decimals.

    Args:
        processed_docs: List of documents, each a list of string tokens.
        num_topics: Number of LDA topics to train.
        num_words: Number of top words taken from each topic.
        random_state: Optional seed forwarded to ``LdaModel`` so a run can
            be reproduced. ``None`` keeps gensim's default behaviour.

    Returns:
        None. The result is written to standard output.
    """
    # Named `id2word` rather than `dictionary` so the local does not shadow
    # the `sudachipy.dictionary` module imported at the top of the file.
    id2word = corpora.Dictionary(processed_docs)
    corpus = [id2word.doc2bow(doc) for doc in processed_docs]
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=id2word,
        passes=10,
        random_state=random_state,
    )

    # Sum each word's weight across all topics.
    word_scores = defaultdict(float)
    for idx in range(num_topics):
        for word, score in lda_model.show_topic(idx, topn=num_words):
            word_scores[word] += score

    # Print the words from highest to lowest total score.
    for word, score in sorted(word_scores.items(), key=lambda item: item[1], reverse=True):
        print(f"{word}: {score:.3f}")

# Main entry point
if __name__ == "__main__":
    # Normalise the answers so inputs such as "JP" or "jp " are accepted.
    lang = input("Enter language (kr for Korean, jp for Japanese): ").strip().lower()
    url = input("Enter URL: ").strip()
    # Load the stopwords before fetching: load_stopwords raises ValueError
    # for an unsupported language code, so a bad code fails before any
    # network request is made.
    stopwords = load_stopwords(lang)
    text = get_text_from_url(url)
    processed_text = analyze_text(lang, text, stopwords)
    # The LDA step takes a list of documents; the page is a single document.
    processed_docs = [processed_text]
    create_lda_model(processed_docs, num_topics=15, num_words=25)


Enter language (kr for Korean, jp for Japanese): jp
Enter URL: https://sakidori.co/article/2103404
テレビ: 0.071
映像: 0.043
見る: 0.041
おすすめ: 0.040
メーカー: 0.039
モデル: 0.037
搭載: 0.036
液晶: 0.035
おり: 0.032
機能: 0.032
Amazon: 0.030
家電: 0.030
楽天: 0.030
技術: 0.029
有機: 0.029
カメラ: 0.029
EL: 0.028
市場: 0.028
人気: 0.027
対応: 0.027
採用: 0.026
年版: 0.024
2024: 0.023
動画: 0.021
実現: 0.018
画質: 0.006
特徴: 0.006
魅力: 0.005
優れ: 0.003
紹介: 0.003
録画: 0.003
楽しめる: 0.003
再現: 0.002
画面: 0.002
パネル: 0.001
一覧: 0.001
記事: 0.001
