In [22]:
import pandas as pd

# 파일 경로 지정
file_path = 'C:/textmining.csv'

# 파일 불러오기
data = pd.read_csv(file_path, encoding='utf-8') 
# 'Abstract' 컬럼만 포함하는 새로운 데이터프레임 생성
abstracts_only = data[['Abstract']]

abstracts_only

Unnamed: 0,Abstract
0,Hypertension is a risk factor for coronary art...
1,BACKGROUND:The clinical significance of isolat...
2,Background Smoking is an important cardiovascu...
3,Background-Population studies have demonstrate...
4,Growing research suggests that posttraumatic s...
...,...
583,Although similar to 25% of colorectal cancer o...
584,The genetic determinants of fasting glucose (F...
585,Genetic influences on lipid traits have been s...
586,"Central obesity, measured by waist circumferen..."


In [7]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.5.10-cp310-cp310-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ---------------------------------------- 42.0/42.0 kB 2.0 MB/s eta 0:00:00
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------- ----------------------------- 0.4/1.5 MB 8.2 MB/s eta 0:00:01
   -------------------- ------------------- 0.8/1.5 MB 8.0 MB/s eta 0:00:01
   -------------------------- ------------- 1.0/1.5 MB 7.8 MB/s eta 0:00:01
   ------------------------------------ --- 1.4/1.5 MB 7.4 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 7.4 MB/s eta 0:00:00
Downloading regex-2024.5.10-cp310-cp310-win_amd64.whl (268 kB)
   ------

In [23]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# 필요한 NLTK 리소스 다운로드
import nltk
nltk.download('punkt')
nltk.download('stopwords')

data = pd.read_csv(file_path, encoding='utf-8')  # 파일 불러오기

# 불용어 목록 로드
stop_words = set(stopwords.words('english'))

# 텍스트 전처리 함수 정의
def preprocess_text(text):
    # 소문자 변환
    text = text.lower()
    # 숫자 및 특수 문자 제거
    text = re.sub(r'\d+', '', text)  # 숫자 제거
    text = re.sub(r'[^\w\s]', '', text)  # 특수 문자 제거
    # 토큰화
    tokens = word_tokenize(text)
    # 불용어 제거 및 길이가 1 이상인 단어만 포함
    tokens = [word for word in tokens if word not in stop_words and len(word) > 1]
    return ' '.join(tokens)

# 'Abstract' 컬럼 전처리
data['Abstract'] = data['Abstract'].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\문창원\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\문창원\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
data[['Abstract']]

Unnamed: 0,Abstract
0,hypertension risk factor coronary artery disea...
1,backgroundthe clinical significance isolated d...
2,background smoking important cardiovascular di...
3,backgroundpopulation studies demonstrated impo...
4,growing research suggests posttraumatic stress...
...,...
583,although similar colorectal cancer polyp crcp ...
584,genetic determinants fasting glucose fg fastin...
585,genetic influences lipid traits suggested nume...
586,central obesity measured waist circumference w...


In [28]:
output_file_path = 'C:/textmining.csv'  # 저장할 파일의 경로 지정
data.to_csv(output_file_path, index=False, encoding='utf-8')  # 인덱스를 포함하지 않고 저장

In [51]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


data = pd.read_csv('C:/textmining.csv')  # 전처리된 파일 경로

# 문서-단어 행렬 생성
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(data['Abstract'])  # 'Abstract' 컬럼에 대해 DTM 생성

# LDA 모델 설정 및 학습
lda = LatentDirichletAllocation(n_components=10, random_state=0)  # 7개의 토픽 추출
lda.fit(dtm)

# 각 토픽의 상위 단어 출력
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx}: ", end='')
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

n_top_words = 10
feature_names = vectorizer.get_feature_names_out()
print_top_words(lda, feature_names, n_top_words)

Topic #0: hypertension risk disease blood factors pressure high genetic cardiovascular cvd
Topic #1: disease genetic vascular sbp traits heritability pressure blood environmental factors
Topic #2: risk associated cardiovascular higher cholesterol ci disease pressure levels age
Topic #3: blood levels associated plasma pressure dna methylation concentrations heritability loci
Topic #4: genetic association genes loci identified genomewide studies variants linkage traits
Topic #5: risk factors disease cvd heart cardiovascular study obesity diabetes health
Topic #6: pressure blood associated genetic hf dbp systolic cardiovascular disease sbp
Topic #7: risk patients polymorphism allele subjects higher associated gene study genotype
Topic #8: bp variants rs hypertension study genetic associated rare eh using
Topic #9: associated disease bp risk genetic hypertension association blood variants chd
