# 월별 데이터 수집

In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time
import re
from tqdm import tqdm
import random
import os
import calendar
from datetime import timedelta, date

# Optional display tweaks, kept for reference:
# # Set the maximum column width
# pd.set_option('display.max_colwidth', None)

# # Set the maximum number of displayed rows
# pd.set_option('display.max_rows', None)

# Put the pandas display limits back to their defaults.
pd.reset_option('display.max_colwidth')
pd.reset_option('display.max_rows')

# NewsAPI credential. The NEWSAPI_KEY environment variable takes precedence so
# the key does not have to live in the notebook; the literal is kept only as a
# backward-compatible fallback when the variable is unset or empty.
# NOTE(review): a credential committed to source should be rotated and removed.
api_key = os.environ.get("NEWSAPI_KEY") or "64a1bdc693984439b9e3a23e58ea5162"

In [14]:
def get_news(api_key, start_date, end_date, query="bitcoin", language="en", page_size=100, max_pages=1000, delay=2, timeout=30):
    """Collect NewsAPI "everything" articles day by day over a date range.

    Each day from ``start_date`` to ``end_date`` (inclusive) is queried
    separately, with both ``from`` and ``to`` set to that day, and paged
    through until the page count derived from the response's
    ``totalResults`` (capped at ``max_pages``) has been requested.

    Args:
        api_key: NewsAPI key sent as the ``apiKey`` parameter.
        start_date: First day to query; must support ``<=`` comparison and
            ``+ timedelta`` (e.g. ``datetime.date``).
        end_date: Last day to query (inclusive).
        query: Search expression sent as ``q``.
        language: Language code sent as ``language``.
        page_size: Articles requested per page.
        max_pages: Upper bound on pages requested per day.
        delay: Seconds to sleep between consecutive page requests.
        timeout: Seconds to wait for each HTTP response before
            ``requests`` raises a timeout error instead of blocking forever.

    Returns:
        ``{"articles": [...]}`` with the article dicts of every successful
        page, in request order.
    """
    base_url = "https://newsapi.org/v2/everything"
    all_articles = []

    current_date = start_date
    while current_date <= end_date:
        # Number of pages this day needs; unknown until a request succeeds.
        pages_for_day = None
        for page in range(1, max_pages + 1):
            parameters = {
                "q": query,
                "language": language,
                "pageSize": page_size,
                "page": page,
                "apiKey": api_key,
                "from": current_date,
                "to": current_date,
                "sortBy": "publishedAt"
            }
            response = requests.get(base_url, params=parameters, timeout=timeout)
            if response.status_code == 200:
                payload = response.json()  # parse the body once per page
                if pages_for_day is None:  # First successful page of the day
                    total_results = payload.get('totalResults', 0)
                    # Ceiling division; evaluates to 0 when there are no results.
                    pages_for_day = min(max_pages, (total_results - 1) // page_size + 1)

                all_articles.extend(payload.get('articles', []))
            else:
                print(f"Error on {current_date} page {page}: {response.status_code}")
                if pages_for_day is None:
                    # The day's size is unknown, so there is no bound on how
                    # many more pages to try; give up on this day instead of
                    # issuing up to max_pages further failing requests.
                    break

            # Failed pages count towards the limit too; otherwise a rejected
            # page would keep the loop requesting pages up to max_pages.
            if page >= pages_for_day:
                break

            time.sleep(delay)

        # Move to the next day
        current_date += timedelta(days=1)

    return {"articles": all_articles}

def convert_to_dataframe(news_data):
    """Turn a ``{"articles": [...]}`` payload into a cleaned DataFrame.

    The ``source`` dicts are stringified, exact duplicate rows are dropped,
    placeholder rows for removed articles are filtered out, and the result
    is sorted by ``publishedAt`` ascending with a fresh 0..n-1 index.

    Args:
        news_data: Mapping with an ``'articles'`` list of article dicts.

    Returns:
        The cleaned ``pandas.DataFrame``. When there are no articles, an
        empty frame with the usual article columns is returned.
    """
    article_columns = [
        'source', 'author', 'title', 'description',
        'url', 'urlToImage', 'publishedAt', 'content',
    ]
    articles = news_data['articles']
    if not articles:
        # pd.DataFrame([]) has no columns, so the 'source' lookup below would
        # raise KeyError; return an empty frame with the expected schema.
        return pd.DataFrame(columns=article_columns)

    df = pd.DataFrame(articles)

    # Convert 'source' column to string to avoid "unhashable type: 'dict'" error
    df['source'] = df['source'].astype(str)

    # Remove duplicates
    df = df.drop_duplicates()

    # Remove placeholder rows: title '[Removed]', url 'https://removed.com',
    # or the epoch timestamp '1970-01-01T00:00:00Z'.
    removed = df['publishedAt'] == '1970-01-01T00:00:00Z'
    if 'title' in df.columns:
        removed |= df['title'] == '[Removed]'
    if 'url' in df.columns:
        removed |= df['url'] == 'https://removed.com'
    df = df[~removed]

    # Sort by 'publishedAt' column in ascending order
    return df.sort_values(by='publishedAt', ascending=True).reset_index(drop=True)



def get_month_range(year, month):
    """Return ``(first_day, last_day)`` of the given month as ``date`` objects."""
    # calendar.monthrange yields (weekday of day 1, number of days in month).
    _, days_in_month = calendar.monthrange(year, month)
    first_of_month = date(year, month, 1)
    return first_of_month, first_of_month.replace(day=days_in_month)

In [15]:
# Get input from the user
# The year is fixed at 2023; the interactive prompt for it is disabled.
# year = int(input("Enter the year (e.g. 2023): "))
year = 2023
# int() raises ValueError if the entered text is not an integer.
month = int(input("Enter the month (1-12): "))
# First and last calendar day of the chosen month.
start_date, end_date = get_month_range(year, month)
# Fetch that month day by day using get_news' default query and paging settings.
news_data = get_news(api_key, start_date, end_date)

In [16]:
# Display the raw {"articles": [...]} payload returned by get_news.
news_data

{'articles': [{'source': {'id': None, 'name': 'Biztoc.com'},
   'author': 'nobsbitcoin.com',
   'title': "How Bitcoin Blockchain Is Fighting Fraud in Guatemala's Presidential Elections",
   'description': 'How Bitcoin Blockchain Is Fighting Fraud in Guatemala\'s Presidential Elections "Thanks to OpenTimestamps, a tool created by bitcoin developer Peter Todd a few years ago, Guatemalan tech startup Simple Proof is able to safeguard key documents about the country…',
   'url': 'https://biztoc.com/x/70ef4bece7097c2e',
   'urlToImage': 'https://c.biztoc.com/p/70ef4bece7097c2e/s.webp',
   'publishedAt': '2023-10-01T23:12:10Z',
   'content': 'How Bitcoin Blockchain Is Fighting Fraud in Guatemala\'s Presidential Elections"Thanks to OpenTimestamps, a tool created by bitcoin developer Peter Todd a few years ago, Guatemalan tech startup Simple… [+212 chars]'},
  {'source': {'id': None, 'name': 'Biztoc.com'},
   'author': 'en.bitcoinsistemi.com',
   'title': 'Sudden Rise in Cryptocurrencies: Bitc

In [17]:
# Build the cleaned, date-sorted DataFrame from the fetched payload.
df = convert_to_dataframe(news_data)
# Number of rows that remain after cleaning.
print(len(df))
# Trailing expression so the notebook renders the frame.
df

7770


Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,"{'id': None, 'name': 'Biztoc.com'}",news.bitcoin.com,Robert Kiyosaki Expects Bitcoin to ‘Become Pri...,Rich Dad Poor Dad author Robert Kiyosaki has p...,https://biztoc.com/x/b5958e9eb152890f,https://c.biztoc.com/p/b5958e9eb152890f/s.webp,2023-10-01T00:10:08Z,Rich Dad Poor Dad author Robert Kiyosaki has p...
1,"{'id': None, 'name': 'Investing.com'}",Cointelegraph,Here’s what happened in crypto today,Here’s what happened in crypto today,https://www.investing.com/news/cryptocurrency-...,https://i-invdn-com.investing.com/news/Cryptoc...,2023-10-01T01:20:17Z,Three Arrow Capital's Su Zhu was arrested in S...
2,"{'id': None, 'name': 'Biztoc.com'}",benzinga.com,"Crypto This Week: Shiba Inu Burn Rate, Rags To...",What a whirlwind of a week in the world of cry...,https://biztoc.com/x/aa13bfbadc3ac2d9,https://c.biztoc.com/p/aa13bfbadc3ac2d9/s.webp,2023-10-01T02:00:05Z,What a whirlwind of a week in the world of cry...
3,"{'id': None, 'name': 'Biztoc.com'}",news.bitcoin.com,Former US President Donald Trump May Change Cr...,The U.S. Securities and Exchange Commission’s ...,https://biztoc.com/x/7640f4425b8d06e9,https://c.biztoc.com/p/7640f4425b8d06e9/s.webp,2023-10-01T02:16:07Z,The U.S. Securities and Exchange Commissions f...
4,"{'id': None, 'name': 'Forbes'}","Max (Chong) Li, Contributor, \n Max (Chong) Li...",Why Blockchain Is Necessary In Decentralized C...,"In an era dominated by centralized entities, d...",https://www.forbes.com/sites/digital-assets/20...,https://imageio.forbes.com/specials-images/ima...,2023-10-01T03:02:11Z,Employee checking a server rack at the new dat...
...,...,...,...,...,...,...,...,...
7765,"{'id': None, 'name': 'CryptoSlate'}",James Van Straten,Digital asset market cap surges by $250 billio...,"Quick Take The ‘Magnificent 7’, a moniker for ...",https://cryptoslate.com/insights/crypto-market...,https://cryptoslate.com/wp-content/uploads/202...,2023-10-26T10:00:21Z,"The ‘Magnificent 7’, a moniker for the top 7 t..."
7766,"{'id': None, 'name': 'Biztoc.com'}",news.bitcoin.com,Galaxy Expects Spot Bitcoin ETF to Push BTC Up...,A spot-based bitcoin exchange-traded fund (ETF...,https://biztoc.com/x/08ebf9426777d73a,https://c.biztoc.com/p/08ebf9426777d73a/s.webp,2023-10-26T10:06:06Z,A spot-based bitcoin exchange-traded fund (ETF...
7767,"{'id': None, 'name': 'Biztoc.com'}",coindesk.com,Bitcoin’s Recent Outperformance Fueled by Inst...,Optimism about the approval of a spot bitcoin ...,https://biztoc.com/x/5631e5a9a6cebe46,https://c.biztoc.com/p/5631e5a9a6cebe46/s.webp,2023-10-26T10:12:05Z,Optimism about the approval of a spot bitcoin ...
7768,"{'id': None, 'name': 'Biztoc.com'}",cryptopotato.com,Here’s When Bitfinex Securities Will List the ...,Bitfinex Securities – the specialized platform...,https://biztoc.com/x/9f677a7e4de73584,https://c.biztoc.com/p/9f677a7e4de73584/s.webp,2023-10-26T10:12:07Z,Bitfinex Securities the specialized platform d...


In [18]:
# One CSV per month, named bitcoin_<year>_<zero-padded month>.csv.
# NOTE(review): the absolute, user-specific path only works on the author's
# machine, and the target directory must already exist.
filename = f"C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/bitcoin_{year}_{month:02}.csv"
# 'utf-8-sig' prepends a UTF-8 byte-order mark; index=False leaves out the row index.
df.to_csv(filename, encoding='utf-8-sig', index=False)

# 연도별 데이터 수집

In [29]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time
import re
from tqdm import tqdm
import random
import os
import calendar
from datetime import timedelta, date

# Optional display tweaks, kept for reference:
# # Set the maximum column width
# pd.set_option('display.max_colwidth', None)

# # Set the maximum number of displayed rows
# pd.set_option('display.max_rows', None)

# Put the pandas display limits back to their defaults.
pd.reset_option('display.max_colwidth')
pd.reset_option('display.max_rows')

# NewsAPI credential. The NEWSAPI_KEY environment variable takes precedence so
# the key does not have to live in the notebook; the literal is kept only as a
# backward-compatible fallback when the variable is unset or empty.
# NOTE(review): a credential committed to source should be rotated and removed.
api_key = os.environ.get("NEWSAPI_KEY") or "64a1bdc693984439b9e3a23e58ea5162"

def get_news(api_key, start_date, end_date, query="virtual asset", language="en", page_size=100, max_pages=1000, delay=2, timeout=30):
    """Collect NewsAPI "everything" articles day by day over a date range.

    Each day from ``start_date`` to ``end_date`` (inclusive) is queried
    separately, with both ``from`` and ``to`` set to that day, and paged
    through until the page count derived from the response's
    ``totalResults`` (capped at ``max_pages``) has been requested.

    Args:
        api_key: NewsAPI key sent as the ``apiKey`` parameter.
        start_date: First day to query; must support ``<=`` comparison and
            ``+ timedelta`` (e.g. ``datetime.date``).
        end_date: Last day to query (inclusive).
        query: Search expression sent as ``q``.
        language: Language code sent as ``language``.
        page_size: Articles requested per page.
        max_pages: Upper bound on pages requested per day.
        delay: Seconds to sleep between consecutive page requests.
        timeout: Seconds to wait for each HTTP response before
            ``requests`` raises a timeout error instead of blocking forever.

    Returns:
        ``{"articles": [...]}`` with the article dicts of every successful
        page, in request order.
    """
    base_url = "https://newsapi.org/v2/everything"
    all_articles = []

    current_date = start_date
    while current_date <= end_date:
        # Number of pages this day needs; unknown until a request succeeds.
        pages_for_day = None
        for page in range(1, max_pages + 1):
            parameters = {
                "q": query,
                "language": language,
                "pageSize": page_size,
                "page": page,
                "apiKey": api_key,
                "from": current_date,
                "to": current_date,
                "sortBy": "publishedAt"
            }
            response = requests.get(base_url, params=parameters, timeout=timeout)
            if response.status_code == 200:
                payload = response.json()  # parse the body once per page
                if pages_for_day is None:  # First successful page of the day
                    total_results = payload.get('totalResults', 0)
                    # Ceiling division; evaluates to 0 when there are no results.
                    pages_for_day = min(max_pages, (total_results - 1) // page_size + 1)

                all_articles.extend(payload.get('articles', []))
            else:
                print(f"Error on {current_date} page {page}: {response.status_code}")
                if pages_for_day is None:
                    # The day's size is unknown, so there is no bound on how
                    # many more pages to try; give up on this day instead of
                    # issuing up to max_pages further failing requests.
                    break

            # Failed pages count towards the limit too; otherwise a rejected
            # page would keep the loop requesting pages up to max_pages.
            if page >= pages_for_day:
                break

            time.sleep(delay)

        # Move to the next day
        current_date += timedelta(days=1)

    return {"articles": all_articles}

def convert_to_dataframe(news_data):
    """Turn a ``{"articles": [...]}`` payload into a cleaned DataFrame.

    The ``source`` dicts are stringified, exact duplicate rows are dropped,
    placeholder rows for removed articles are filtered out, and the result
    is sorted by ``publishedAt`` ascending with a fresh 0..n-1 index.

    Args:
        news_data: Mapping with an ``'articles'`` list of article dicts.

    Returns:
        The cleaned ``pandas.DataFrame``. When there are no articles, an
        empty frame with the usual article columns is returned.
    """
    article_columns = [
        'source', 'author', 'title', 'description',
        'url', 'urlToImage', 'publishedAt', 'content',
    ]
    articles = news_data['articles']
    if not articles:
        # pd.DataFrame([]) has no columns, so the 'source' lookup below would
        # raise KeyError; return an empty frame with the expected schema.
        return pd.DataFrame(columns=article_columns)

    df = pd.DataFrame(articles)

    # Convert 'source' column to string to avoid "unhashable type: 'dict'" error
    df['source'] = df['source'].astype(str)

    # Remove duplicates
    df = df.drop_duplicates()

    # Remove placeholder rows: title '[Removed]', url 'https://removed.com',
    # or the epoch timestamp '1970-01-01T00:00:00Z'.
    removed = df['publishedAt'] == '1970-01-01T00:00:00Z'
    if 'title' in df.columns:
        removed |= df['title'] == '[Removed]'
    if 'url' in df.columns:
        removed |= df['url'] == 'https://removed.com'
    df = df[~removed]

    # Sort by 'publishedAt' column in ascending order
    return df.sort_values(by='publishedAt', ascending=True).reset_index(drop=True)


def get_month_range(year, month):
    """Return the first and last calendar dates of ``month`` in ``year``."""
    # Index 1 of calendar.monthrange's result is the month's day count.
    days_in_month = calendar.monthrange(year, month)[1]
    month_bounds = (date(year, month, 1), date(year, month, days_in_month))
    return month_bounds

In [31]:
# Get input from the user for the year
# int() raises ValueError if the entered text is not an integer.
year = int(input("Enter the year (e.g. 2023): "))

# Loop through each month for the given year: fetch, clean, and write one
# CSV per month, so months already saved survive an interruption.
for month in range(1, 13):
    start_date, end_date = get_month_range(year, month)
    # Uses get_news' default query and paging settings.
    news_data = get_news(api_key, start_date, end_date)

    df = convert_to_dataframe(news_data)

    # NOTE(review): absolute, user-specific output path; the directory must
    # already exist.
    filename = f"C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_{year}_{month:02}.csv"
    # 'utf-8-sig' prepends a UTF-8 byte-order mark; index=False leaves out the row index.
    df.to_csv(filename, encoding='utf-8-sig', index=False)
    print(f"Saved data for {year}-{month:02} to {filename}")

Saved data for 2023-01 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_01.csv
Saved data for 2023-02 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_02.csv
Saved data for 2023-03 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_03.csv
Saved data for 2023-04 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_04.csv
Saved data for 2023-05 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_05.csv
Saved data for 2023-06 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_06.csv
Saved data for 2023-07 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_07.csv
Saved data for 2023-08 to C:/Users/boyu57

KeyboardInterrupt: 

In [30]:
# Loop through every month of the years 2018-2022 (range's upper bound, 2023,
# is excluded): fetch, clean, and write one CSV per month.
for year in range(2018, 2023):
    for month in range(1, 13):
        start_date, end_date = get_month_range(year, month)
        # Uses get_news' default query and paging settings.
        news_data = get_news(api_key, start_date, end_date)

        df = convert_to_dataframe(news_data)

        # NOTE(review): absolute, user-specific output path; the directory
        # must already exist.
        filename = f"C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_{year}_{month:02}.csv"
        # 'utf-8-sig' prepends a UTF-8 byte-order mark; index=False leaves out the row index.
        df.to_csv(filename, encoding='utf-8-sig', index=False)
        print(f"Saved data for {year}-{month:02} to {filename}")

Saved data for 2018-01 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_01.csv
Saved data for 2018-02 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_02.csv
Saved data for 2018-03 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_03.csv
Saved data for 2018-04 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_04.csv
Saved data for 2018-05 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_05.csv
Saved data for 2018-06 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_06.csv
Saved data for 2018-07 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_07.csv
Saved data for 2018-08 to C:/Users/boyu57