# SK하이닉스 리더십 뉴스 데이터 기반 LDA 토픽 모델링 분석

In [None]:
!pip install konlpy
!pip install gensim

In [None]:
from google.colab import files
# Open Colab's file-upload dialog in the browser and keep whatever
# files.upload() returns in `uploaded`.
# NOTE(review): the next cell reads 'leadership_news.csv' by name, so the file
# chosen here presumably has to be uploaded under exactly that name - confirm.
uploaded = files.upload()

In [None]:
import pandas as pd
# Load the news dataset from the CSV file in the current working directory.
df = pd.read_csv('leadership_news.csv')
# Take the 'content' column, drop rows where it is missing, and convert the
# remaining values to a plain Python list (one entry per document).
documents = df['content'].dropna().tolist()

## 도메인 키워드 및 불용어 정의

In [None]:
# Domain-specific terms that preprocess() checks tokens against.
# Grouped by theme: memory products, AI-related phrases, leadership/culture.
domain_keywords = {
    # Memory products and standards
    'HBM', 'HBM3', 'DDR5', 'DDR4', '낸드플래시', 'D램', 'DRAM',
    # AI-related terms
    'AI', 'AI 리더십', 'AI 생태계', 'AI 메모리',
    # Management / leadership vocabulary
    'SKMS', '리더십', '조직문화', '지속가능경영', '고객 중심', '기술 리더십',
}

# Generic Korean words that preprocess() filters out of the token list.
stopwords = [
    '하다', '되다', '있다', '없다',
    '위해', '대한', '그리고', '또한',
    '이런', '이번', '통해', '적용', '포함',
]

In [None]:
from konlpy.tag import Okt
# Create one module-level Okt tagger instance; preprocess() below calls
# okt.nouns() on it for every document instead of building a new tagger each time.
okt = Okt()

def preprocess(doc):
    """Tokenize one document into domain keywords plus filtered nouns.

    The domain keywords are pulled out of the raw text *before* noun
    extraction. Checking ``t in domain_keywords`` on the output of
    ``okt.nouns()`` had no effect: every keyword is longer than one character
    and is not a stopword, so it would pass the generic filter anyway, and
    multi-word keywords ('AI 리더십') or Latin-alphabet keywords ('HBM', 'AI')
    are not produced by the noun extractor as single tokens in the first place.

    Args:
        doc: Raw document text (str).

    Returns:
        list[str]: One entry per domain-keyword occurrence (in keyword order,
        longest keyword first), followed by the nouns found in the rest of
        the text that are longer than one character and not in ``stopwords``.
        Text consumed by a keyword match is not tokenized again, so
        'AI 리더십' yields that single token rather than an extra '리더십'.
    """
    import re  # local import keeps this cell self-contained

    tokens = []
    remaining = doc

    # Longest keywords first so 'AI 리더십' wins over 'AI' / '리더십' and
    # 'HBM3' wins over 'HBM'; ties are broken alphabetically for determinism.
    for keyword in sorted(domain_keywords, key=lambda k: (-len(k), k)):
        if not keyword:
            continue
        pattern = re.escape(keyword)
        # Guard ASCII edges so 'AI' does not match inside 'MAIN' or 'HBM'
        # inside 'HBM3E'. Hangul edges are left open because particles attach
        # directly to the word (e.g. '리더십을').
        if keyword[0].isascii() and keyword[0].isalnum():
            pattern = r'(?<![A-Za-z0-9])' + pattern
        if keyword[-1].isascii() and keyword[-1].isalnum():
            pattern = pattern + r'(?![A-Za-z0-9])'
        # Replace each hit with a space so neighbouring words stay separated
        # and the matched text is not counted a second time below.
        remaining, hits = re.subn(pattern, ' ', remaining)
        tokens.extend([keyword] * hits)

    tokens.extend(
        t for t in okt.nouns(remaining)
        if len(t) > 1 and t not in stopwords
    )
    return tokens

# Run preprocess() over every document, producing one token list per document.
texts = list(map(preprocess, documents))

In [None]:
from gensim import corpora, models

# Build the token <-> integer-id mapping from the tokenized documents, then
# encode each document as a bag-of-words via dictionary.doc2bow().
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# LDA training is randomized; without a fixed seed the discovered topics
# change from run to run, which makes the analysis impossible to reproduce.
lda_model = models.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Show the 5 highest-weighted words of each topic.
topics = lda_model.print_topics(num_words=5)

for topic in topics:
    print(topic)