In [2]:
from infrastructure.database.mongo_client import MongoDBClient
from infrastructure.repository import YoutubeContentRepository, YoutubeKeyPointCollectionRepository
from domain import YouTubeContent
from domain import YoutubeTimelineSummary, YoutubeTimelineSection
from domain import YoutubeKeyPointCollection, YoutubeKeyPoint

import pandas as pd
import json
import os
from dotenv import load_dotenv

# Export a de-duplicated term/description corpus from the stored YouTube key
# points: read every content document, collect its key points, keep the first
# description seen for each term, and write the result to a JSON file.
load_dotenv()

# Initialise the MongoDB client from the connection string in the environment.
# A missing MONGO_CONNECTION_STRING raises KeyError here, before any I/O.
client = MongoDBClient(uri=os.environ['MONGO_CONNECTION_STRING'])
client.connect()

# Initialise the repositories.
content_repo = YoutubeContentRepository(client)
keypoint_repo = YoutubeKeyPointCollectionRepository(client)

contents = content_repo.find_all()

# Accumulates one {"term", "description"} record per unique term.
corpus_data = []

# Terms already emitted, in first-seen order.
keys = []
# Set mirror of `keys`, used only for membership tests so that de-duplication
# is O(1) per key point instead of a linear scan of the growing list.
# NOTE(review): assumes key_point.term is hashable (presumably a str) - confirm.
seen_terms = set()

for content in contents:
    content: YouTubeContent

    # Contents without a timeline summary are skipped entirely.
    if content.timeline_summary is None:
        continue

    collection: YoutubeKeyPointCollection = keypoint_repo.get(content.url.url)

    # No key-point collection stored for this URL.
    if collection is None:
        continue

    for key_point in collection.key_points:
        term = key_point.term
        # First occurrence wins; later duplicates of the same term are dropped.
        if term in seen_terms:
            continue

        # Add the record to the corpus.
        corpus_data.append({
            "term": term,
            "description": key_point.description
        })

        seen_terms.add(term)
        keys.append(term)

# Convert to a pandas DataFrame and report the number of unique terms.
df = pd.DataFrame(corpus_data)
print(len(df))

# Save as JSON; ensure_ascii=False keeps non-ASCII text readable in the file.
output_file = 'corpus2.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(corpus_data, f, ensure_ascii=False, indent=4)


print("Corpus JSON 파일이 성공적으로 저장되었습니다.")

Connected to MongoDB
1045
Corpus JSON 파일이 성공적으로 저장되었습니다.
