In [1]:
import requests
from bs4 import BeautifulSoup
import time
import re
import pandas as pd
from tqdm import tqdm
import random
import os

In [11]:
# 새로 업로드한 HTML 파일을 읽기
with open('Factiva.com.html', 'r', encoding='utf-8') as file:
    new_html_content = file.read()

# HTML 내용을 파싱
soup_new = BeautifulSoup(new_html_content, 'html.parser')

In [12]:
# <p> 태그를 사용해 기사 내용을 찾아보기
paragraphs = soup_new.find_all('p')

# 기사 내용이 포함될 것으로 예상되는 긴 <p> 태그들을 찾기
long_paragraphs = [p for p in paragraphs if len(p.get_text(strip=True)) > 50]

# 확인을 위해 첫 번째로 발견된 긴 <p> 태그의 내용 출력
long_paragraphs[0].get_text(strip=True) if long_paragraphs else "No long paragraphs found"


'Reuters News,   Wednesday,  06  September  2023,  01:19 GMT,  129 Words,  (English), Copyright 2023 Thomson Reuters. All Rights Reserved.(Document LBA0000020230905ej9503js1)'

In [14]:
paragraphs

[<p class="articleMeta" style="font-size: 11px;line-height: 14px;color: #b5b5b5;margin: .4em 0;padding: 0;">Reuters News,   Wednesday,  06  September  2023,  01:19 GMT,  129 Words,  (English), Copyright 2023 Thomson Reuters. All Rights Reserved.<br/>(Document LBA0000020230905ej9503js1)</p>,
 <p class="articleBody" style="font-size: 12px;line-height: 20px;color: #333333;margin: 1em 0;padding: 0;">
             (Reuters) - Crytocurrency exchange Coinbase Global is launching a digital asset lending platform aimed at large institutional investors, a company spokesperson told Reuters on Tuesday.</p>,
 <p class="articleMeta" style="font-size: 11px;line-height: 14px;color: #b5b5b5;margin: .4em 0;padding: 0;">Benzinga.com, Benzinga Neuro,   Wednesday,  06  September  2023,  01:06 GMT,  451 Words,  (English), Copyright 2023. Benzinga.com<br/>(Document BNZNGA0020230906ej960005l)</p>,
 <p class="articleBody" style="font-size: 12px;line-height: 20px;color: #333333;margin: 1em 0;padding: 0;">
     

In [13]:
# 각 <p> 태그의 내용을 분석하여 기사의 제목과 내용 추출
titles_new = []
contents_new = []

for p in long_paragraphs:
    text = p.get_text(strip=True)
    
    # 기사의 제목과 내용을 분리하기 위한 간단한 방법: 
    # 첫 번째 문장을 제목으로 간주하고 나머지를 본문으로 간주
    sentences = text.split('.')
    titles_new.append(sentences[0])
    contents_new.append(".".join(sentences[1:]))

# 결과를 DataFrame 형식으로 변환
df_new = pd.DataFrame({
    'Title': titles_new,
    'Content': contents_new
})

# DataFrame의 첫 5개 행을 출력하여 확인
df_new.head()


Unnamed: 0,Title,Content
0,"Reuters News, Wednesday, 06 September 202...",All Rights Reserved.(Document LBA000002023090...
1,(Reuters) - Crytocurrency exchange Coinbase Gl...,
2,Benzinga,"com, Benzinga Neuro, Wednesday, 06 Septemb..."
3,Cryptocurrency expert Jameson Lopp predicts th...,
4,Benzinga,"com, Mehab Qureshi, Wednesday, 06 Septembe..."


In [2]:
# Read the new text file content
with open("Factiva-07September20231123.txt", "r", encoding="ISO-8859-1") as file:
    new_txt_content = file.read()

In [6]:
# Final code to split the articles based on the "Document XXXX..." pattern, keeping the delimiter at the end,
# and then remove the "Document XXXX..." pattern from the start of each article

# Split the content based on the "Document XXXX..." pattern, keeping the delimiter at the end
articles_with_document_split = re.split(r'(Document \w{8}\d{8}\w{7})', new_txt_content)

# Adjust the split to form articles with the "Document XXXX..." pattern only at the end
adjusted_articles_final = [articles_with_document_split[i+1] + articles_with_document_split[i] 
                          for i in range(0, len(articles_with_document_split)-1, 2)]

# Convert the adjusted articles into a DataFrame
df_adjusted_articles_final = pd.DataFrame(adjusted_articles_final, columns=["Article_Content"])

# Remove the "Document XXXX..." pattern from the start of each article
df_adjusted_articles_final["Article_Content"] = df_adjusted_articles_final["Article_Content"].str.replace(r'^Document \w{8}\d{8}\w{7}', '', regex=True)

df_adjusted_articles_final["Article_Content"] = df_adjusted_articles_final["Article_Content"].str.replace(r'^.*?\n+', '', regex=True)


In [13]:
df_adjusted_articles_final

Unnamed: 0,Article_Content,Title,Content,Publication_Date
0,Business\nDigital currencies: CBA to limit fun...,Business,Digital currencies: CBA to limit fund transfer...,
1,General News\nWinter is here for exchanges\n\n...,General News,Winter is here for exchanges\n\nLex comment \n...,
2,Business\nCBA s crypto block to hit scammers\...,Business,CBA s crypto block to hit scammers\n\n144 wor...,
3,Business\nGreatest innovation ever: AI to slas...,Business,"Greatest innovation ever: AI to slash prices, ...",
4,News\nBanks put the brake on crypto\n\nBen But...,News,Banks put the brake on crypto\n\nBen Butler \n...,
5,Business\nAnimoca to focus on progressive ma...,Business,Animoca to focus on progressive markets \n\...,
6,"Business\nHK, Singapore seek balance on digita...",Business,"HK, Singapore seek balance on digital assets\n...",
7,Front Page Business\nWinklevoss twins look to ...,Front Page Business,Winklevoss twins look to expand their crypto b...,
8,Philippines Government Accredited Company IMPE...,Philippines Government Accredited Company IMPE...,\nCoinstore.com; PR Newswire \n708 words\n8 Ju...,
9,"MIL-OSI USA: Van Hollen, Warren Request DOJ In...","MIL-OSI USA: Van Hollen, Warren Request DOJ In...","\n1,983 words\n9 June 2023\nForeignAffairs.co....",


In [12]:
# Extract title, content, and publication date from each article

# Extract title (assuming title is the first line of the article)
df_adjusted_articles_final['Title'] = df_adjusted_articles_final['Article_Content'].str.split('\n').str[0]

# Extract content (assuming content is everything after the title and before the last line which contains the Document pattern)
df_adjusted_articles_final['Content'] = df_adjusted_articles_final['Article_Content'].apply(lambda x: '\n'.join(x.split('\n')[1:-1]))

# Extract publication date from the "Document XXXX..." pattern
df_adjusted_articles_final['Publication_Date'] = df_adjusted_articles_final['Article_Content'].str.extract(r'Document \w{4}(\d{8})\w{7}')

# Drop the original combined column
df_extracted_data = df_adjusted_articles_final.drop(columns=["Article_Content"])

# Display the first few rows of the extracted DataFrame
df_extracted_data.head()


Unnamed: 0,Title,Content,Publication_Date
0,Business,Digital currencies: CBA to limit fund transfer...,
1,General News,Winter is here for exchanges\n\nLex comment \n...,
2,Business,CBA s crypto block to hit scammers\n\n144 wor...,
3,Business,"Greatest innovation ever: AI to slash prices, ...",
4,News,Banks put the brake on crypto\n\nBen Butler \n...,
