In [3]:
import os
import time
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# .env 파일에서 API 키 불러오기
load_dotenv('./data/bunhine.env')
client_id = os.getenv('client_id')
client_secret = os.getenv('client_secret')

# HTTP 요청 헤더 설정
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# 불필요한 텍스트를 제거하기 위한 정규식 패턴
unwanted_text_patterns = [
    "URL 복사",
    "이웃추가",
    "본문 기타 기능",
    "공유하기",
    "신고하기"
]

def clean_text(text):
    for pattern in unwanted_text_patterns:
        text = re.sub(pattern, '', text)
    # 여러 줄바꿈을 하나의 공백으로 치환
    text = re.sub(r'\n+', ' ', text).strip()
    return text

def get_blog_content(url):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        
        # 네이버 블로그의 경우, iframes 안에 본문이 있기 때문에 이를 처리합니다.
        iframe = soup.find('iframe', {'id': 'mainFrame'})
        if iframe:
            iframe_url = "https://blog.naver.com" + iframe['src']
            iframe_response = requests.get(iframe_url, headers=headers)
            iframe_response.raise_for_status()
            iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
            content = iframe_soup.select("div.se-component, div.postViewArea")
            if content:
                text = "\n".join([part.get_text(separator='\n').strip() for part in content])
                return clean_text(text)
        
        # iframe이 없는 경우에도 대응합니다.
        content = soup.select("div.se-component, div.postViewArea")
        if content:
            text = "\n".join([part.get_text(separator='\n').strip() for part in content])
            return clean_text(text)
        return None
    except Exception as e:
        print(f"Error crawling {url}: {e}")
        return None

def main():
    # 저장된 블로그 URL 데이터를 불러옵니다.
    blog_df = pd.read_csv('./data/naver_blog_starbucks.csv')
    final_data = []
    total_urls = len(blog_df)
    batch_size = 1000

    for start in range(18000, total_urls, batch_size):
        batch_df = blog_df[start:start+batch_size]
        batch_data = []
        for index, row in batch_df.iterrows():
            store_name = row['Store_Name']
            url = row['urls']
            print(f"Crawling content for URL: {url}")
            content = get_blog_content(url)
            if content:
                batch_data.append({'Store_Name': store_name, 'Content': content})
            time.sleep(1)  # 너무 빠른 요청을 피하기 위해 잠시 대기

        # 배치 데이터를 CSV 파일로 저장합니다.
        batch_df = pd.DataFrame(batch_data)
        batch_df.to_csv(f'./data/starbucks_naver_blog_batch_{start//batch_size}.csv', index=False, encoding='utf-8-sig')
        final_data.extend(batch_data)

    # 최종 데이터를 CSV 파일로 저장합니다.
    final_df = pd.DataFrame(final_data)
    combined_df = final_df.groupby('Store_Name')['Content'].apply(' '.join).reset_index()
    combined_df.to_csv('./data/starbucks_naver_blog_crawling.csv', index=False, encoding='utf-8-sig')
    print("Saved combined blog contents to './data/starbucks_naver_blog_crawling.csv'")

if __name__ == '__main__':
    main()

Crawling content for URL: https://blog.naver.com/dewgi11/223345294849
Crawling content for URL: https://blog.naver.com/nms6101/223347254504
Crawling content for URL: https://blog.naver.com/mintchoco1120/223490815745
Crawling content for URL: https://blog.naver.com/k_kkomi/223319517632
Crawling content for URL: https://blog.naver.com/unsinae/223336987994
Crawling content for URL: https://blog.naver.com/zimmyneutron/223326753679
Crawling content for URL: https://blog.naver.com/gamsa505/223459951316
Crawling content for URL: https://blog.naver.com/coffee_1001/223358917085
Crawling content for URL: https://blog.naver.com/ymsmile0502/223475188896
Crawling content for URL: https://blog.naver.com/rivertour/223055724501
Crawling content for URL: https://blog.naver.com/hee_hee_hee/223224876170
Crawling content for URL: https://blog.naver.com/loveiistory/223067550865
Crawling content for URL: https://blog.naver.com/torystory__/223369384530
Crawling content for URL: https://blog.naver.com/jin_dre

In [1]:
import pandas as pd
import glob

# 모든 파일 경로를 읽어옵니다.
file_paths = glob.glob("./data/starbucks_naver_blog_batch_*.csv")

# 모든 파일을 하나의 데이터프레임으로 합칩니다.
df_list = [pd.read_csv(file) for file in file_paths]
combined_df = pd.concat(df_list, ignore_index=True)

# Store_Name이 같은 Content를 하나의 열로 합칩니다.
combined_df['Content'] = combined_df.groupby('Store_Name')['Content'].transform(lambda x: ' '.join(x))
combined_df = combined_df.drop_duplicates(subset=['Store_Name'])

# 합친 데이터를 csv 파일로 저장합니다.
combined_df.to_csv("./data/combined_starbucks_blog_contents.csv", index=False)

print("파일 합치기 및 Content 통합 완료.")


ParserError: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.
