# 월별 데이터 수집

In [4]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time
import re
from tqdm import tqdm
import random
import os
import calendar
from datetime import timedelta, date

# # Set the maximum column width
# pd.set_option('display.max_colwidth', None)

# # Set the maximum number of displayed rows
# pd.set_option('display.max_rows', None)

# Put the pandas display limits back to their defaults.
pd.reset_option('display.max_colwidth')
pd.reset_option('display.max_rows')

# NOTE(review): this key is committed in plain text; it should be rotated and
# supplied through the NEWSAPI_KEY environment variable instead. The literal is
# kept only as a fallback so existing runs keep working unchanged.
api_key = os.environ.get("NEWSAPI_KEY", "64a1bdc693984439b9e3a23e58ea5162")

In [18]:
def get_news(api_key, start_date, end_date, query="virtual asset", language="en", page_size=100, max_pages=1000, delay=2):
    """Collect articles from the NewsAPI ``/v2/everything`` endpoint, one day at a time.

    Each day in ``[start_date, end_date]`` is queried separately (``from`` and
    ``to`` are both set to that day) and paged through until every result
    reported by ``totalResults`` has been fetched or ``max_pages`` is reached.

    Args:
        api_key: NewsAPI key sent as the ``apiKey`` query parameter.
        start_date: First day to fetch; must support ``+ timedelta`` and ``<=``
            (e.g. ``datetime.date``).
        end_date: Last day to fetch (inclusive).
        query: Search phrase sent as ``q``.
        language: Language code sent as ``language``.
        page_size: Number of articles requested per page.
        max_pages: Upper bound on pages requested per day.
        delay: Seconds to sleep between consecutive page requests of one day.

    Returns:
        ``{"articles": [...]}`` with the raw article dicts of every day, in
        fetch order. An empty range (``start_date > end_date``) yields an
        empty list without issuing any request.

    Raises:
        requests.exceptions.RequestException: On network failure or when a
            request exceeds the 30 second timeout.
    """
    base_url = "https://newsapi.org/v2/everything"
    all_articles = []

    current_date = start_date
    while current_date <= end_date:
        max_pages_for_day = max_pages
        for page in range(1, max_pages + 1):
            parameters = {
                "q": query,
                "language": language,
                "pageSize": page_size,
                "page": page,
                "apiKey": api_key,
                "from": current_date,
                "to": current_date,
                "sortBy": "publishedAt"
            }
            # A timeout keeps one stalled connection from hanging a long run.
            response = requests.get(base_url, params=parameters, timeout=30)

            if response.status_code != 200:
                print(f"Error on {current_date} page {page}: {response.status_code}")
                # Stop paging this day: requesting the following pages after a
                # failure would either repeat the same error up to max_pages
                # times or silently skip the page that failed.
                break

            payload = response.json()  # parse the body once per response
            if page == 1:
                # The first page tells us how many pages this day really has.
                total_results = payload.get('totalResults', 0)
                max_pages_for_day = min(max_pages, (total_results - 1) // page_size + 1)

            all_articles.extend(payload.get('articles', []))

            if page >= max_pages_for_day:  # all available pages fetched
                break

            # Throttle only when another page of the same day follows.
            time.sleep(delay)

        # Move to the next day
        current_date += timedelta(days=1)

    return {"articles": all_articles}

def convert_to_dataframe(news_data):
    """Turn a ``get_news``-style payload into a cleaned, time-sorted DataFrame.

    Args:
        news_data: Mapping with an ``'articles'`` key holding a list of
            article dicts (each expected to carry ``'source'`` and
            ``'publishedAt'``).

    Returns:
        A DataFrame with the ``source`` dict stringified, exact duplicate rows
        dropped, rows dated ``1970-01-01T00:00:00Z`` removed, sorted ascending
        by ``publishedAt`` with a fresh 0..n-1 index. An empty article list
        yields an empty DataFrame.
    """
    df = pd.DataFrame(news_data['articles'])

    # With no articles the frame has no columns at all, so the column lookups
    # below would raise KeyError; hand back the empty frame instead.
    if df.empty:
        return df

    # Convert 'source' column to string to avoid "unhashable type: 'dict'" error
    df['source'] = df['source'].astype(str)

    # Remove duplicates
    df = df.drop_duplicates()

    # Drop rows stamped with the Unix-epoch date. The original note associates
    # these with '[Removed]' / 'https://removed.com' placeholder entries; only
    # the date is actually checked here.
    removal_conditions = (df['publishedAt'] == '1970-01-01T00:00:00Z')
    df = df[~removal_conditions]

    # Sort by 'publishedAt' column in ascending order
    df = df.sort_values(by='publishedAt', ascending=True).reset_index(drop=True)

    return df



def get_month_range(year, month):
    """Return ``(first_day, last_day)`` of the given month as ``date`` objects."""
    # monthrange yields (weekday of the 1st, number of days); only the day
    # count is needed to locate the month's final date.
    _, days_in_month = calendar.monthrange(year, month)
    return date(year, month, 1), date(year, month, days_in_month)

In [25]:
# The year is pinned for this run; only the month is asked for interactively.
# year = int(input("Enter the year (e.g. 2023): "))
year = 2017
month = int(input("Enter the month (1-12): "))

# Resolve the month's first/last day, then download every article in between.
start_date, end_date = get_month_range(year, month)
news_data = get_news(api_key, start_date=start_date, end_date=end_date)

In [26]:
# Bare expression: in a notebook this displays the raw payload fetched above.
news_data

{'articles': [{'source': {'id': None, 'name': 'Abajournal.com'},
   'author': 'Jason Tashea',
   'title': 'US regulator to allow bitcoin futures trading for the first time',
   'description': 'The U.S. Commodity Futures Trading Commission will allow futures trading of bitcoin.In a &lt;a href=&quot;http://www.cftc.gov/PressRoom/PressReleases/pr7654-17&quot;&gt;press release&lt;/a&gt; issued Friday, the derivatives regulator cleared the way for the C…',
   'url': 'http://www.abajournal.com/news/article/us_regulator_allows_for_bitcoin_futures_trading_for_the_first_time?utm_source=feedburner&utm_medium=feed&utm_campaign=ABA+Journal+Top+Stories#When:20:40:00Z',
   'urlToImage': 'http://www.abajournal.com/images/main_images/bitcoin2.jpg',
   'publishedAt': '2017-12-01T20:40:00Z',
   'content': 'Home Daily News US regulator to allow bitcoin futures trading… By Jason Tashea Posted December 1, 2017, 2:40 pm CST Shutterstock.com The U.S. Commodity Futures Trading Commission will allow futures t…

In [27]:
# Clean the raw payload, report how many rows survived, then display the frame.
df = convert_to_dataframe(news_data)
row_count = len(df)
print(row_count)
df

1003


Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,"{'id': None, 'name': 'JD Supra'}",,The Numbers Don’t Lie: The SEC Pursues a More ...,"[co-author: Kate Ross, Law Clerk*] One of the ...",http://www.jdsupra.com/legalnews/the-numbers-d...,https://www.jdsupra.com/showThumbnail.aspx?img...,2017-12-01T00:15:00Z,"[co-author: Kate Ross, Law Clerk*] One of the ..."
1,"{'id': None, 'name': 'Geospatial World'}",,Top eight disruptive technologies and how they...,We list the top eight disruptive technologies ...,https://www.geospatialworld.net/blogs/top-disr...,https://geospatialmedia.s3.amazonaws.com/wp-co...,2017-12-01T00:30:00Z,The fourth industrial revolution and oncoming ...
2,"{'id': None, 'name': 'Free Malaysia Today'}",AFP,Ever-volatile bitcoin is embraced by Wall Street,"The exchanges will trade bitcoin derivatives, ...",http://www.freemalaysiatoday.com/category/busi...,http://s3media.freemalaysiatoday.com/wp-conten...,2017-12-01T00:30:22Z,"The exchanges will trade bitcoin derivatives, ..."
3,"{'id': None, 'name': 'Sys-con.com'}",,Strong M&A Activity Growth Forecast for Asia P...,read more,http://news.sys-con.com/node/4204609,,2017-12-01T01:00:00Z,"SYDNEY, AUSTRALIA -- (Marketwired) -- 11/30/17..."
4,"{'id': None, 'name': 'Bitcoinist'}",Bitcoinist.net,Fintech Innovator Humaniq and Regtech Leader C...,"London, United Kingdom – November 30, 2017 – H...",http://bitcoinist.com/fintech-innovator-humani...,http://bitcoinist.com/wp-content/uploads/2017/...,2017-12-01T02:15:30Z,"Bitcoinist.net · November 30, 2017 · 9:15 pm L..."
...,...,...,...,...,...,...,...,...
998,"{'id': 'crypto-coins-news', 'name': 'Crypto Co...",Lester Coleman,2017 Review: The Year Cryptocurrencies Seized ...,The post 2017 Review: The Year Cryptocurrencie...,https://www.ccn.com/2017-crypto-surges-forks-r...,https://www.ccn.com/wp-content/uploads/2017/12...,2017-12-31T13:36:04Z,Get Trading Recommendations and Read Analysis ...
999,"{'id': None, 'name': 'Security Boulevard'}",Jon Griffin,SaaS Domain Controller,<p>As more IT management solutions move to the...,https://securityboulevard.com/2017/12/saas-dom...,https://jumpcloud.com/blog/wp-content/uploads/...,2017-12-31T15:00:04Z,As more IT management solutions move to the cl...
1000,"{'id': None, 'name': 'ValueWalk'}",marcuss,Why the pundits are afraid of bitcoin,It seems like every day another financial lumi...,http://www.valuewalk.com/2017/12/pundits-afrai...,http://www.valuewalk.com/wp-content/uploads/20...,2017-12-31T17:44:33Z,It seems like every day another financial lumi...
1001,"{'id': None, 'name': 'Independent.ie'}",Adrian Weckler,Guide to top tech trends in 2018,1 Bitcoin - a year of sticking or twisting,https://www.independent.ie/business/technology...,https://www.independent.ie/business/technology...,2017-12-31T18:00:40Z,"At the beginning of 2017, Bitcoin and cryptocu..."


In [28]:
# One CSV per (year, month); utf-8-sig adds a BOM to the written file.
filename = f"C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_{year}_{month:02}.csv"
df.to_csv(
    filename,
    index=False,
    encoding='utf-8-sig',
)

# 연도별 데이터 수집

In [29]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time
import re
from tqdm import tqdm
import random
import os
import calendar
from datetime import timedelta, date

# # Set the maximum column width
# pd.set_option('display.max_colwidth', None)

# # Set the maximum number of displayed rows
# pd.set_option('display.max_rows', None)

# Put the pandas display limits back to their defaults.
pd.reset_option('display.max_colwidth')
pd.reset_option('display.max_rows')

# NOTE(review): this key is committed in plain text; it should be rotated and
# supplied through the NEWSAPI_KEY environment variable instead. The literal is
# kept only as a fallback so existing runs keep working unchanged.
api_key = os.environ.get("NEWSAPI_KEY", "64a1bdc693984439b9e3a23e58ea5162")

def get_news(api_key, start_date, end_date, query="virtual asset", language="en", page_size=100, max_pages=1000, delay=2):
    """Collect articles from the NewsAPI ``/v2/everything`` endpoint, one day at a time.

    Each day in ``[start_date, end_date]`` is queried separately (``from`` and
    ``to`` are both set to that day) and paged through until every result
    reported by ``totalResults`` has been fetched or ``max_pages`` is reached.

    Args:
        api_key: NewsAPI key sent as the ``apiKey`` query parameter.
        start_date: First day to fetch; must support ``+ timedelta`` and ``<=``
            (e.g. ``datetime.date``).
        end_date: Last day to fetch (inclusive).
        query: Search phrase sent as ``q``.
        language: Language code sent as ``language``.
        page_size: Number of articles requested per page.
        max_pages: Upper bound on pages requested per day.
        delay: Seconds to sleep between consecutive page requests of one day.

    Returns:
        ``{"articles": [...]}`` with the raw article dicts of every day, in
        fetch order. An empty range (``start_date > end_date``) yields an
        empty list without issuing any request.

    Raises:
        requests.exceptions.RequestException: On network failure or when a
            request exceeds the 30 second timeout.
    """
    base_url = "https://newsapi.org/v2/everything"
    all_articles = []

    current_date = start_date
    while current_date <= end_date:
        max_pages_for_day = max_pages
        for page in range(1, max_pages + 1):
            parameters = {
                "q": query,
                "language": language,
                "pageSize": page_size,
                "page": page,
                "apiKey": api_key,
                "from": current_date,
                "to": current_date,
                "sortBy": "publishedAt"
            }
            # A timeout keeps one stalled connection from hanging a long run.
            response = requests.get(base_url, params=parameters, timeout=30)

            if response.status_code != 200:
                print(f"Error on {current_date} page {page}: {response.status_code}")
                # Stop paging this day: requesting the following pages after a
                # failure would either repeat the same error up to max_pages
                # times or silently skip the page that failed.
                break

            payload = response.json()  # parse the body once per response
            if page == 1:
                # The first page tells us how many pages this day really has.
                total_results = payload.get('totalResults', 0)
                max_pages_for_day = min(max_pages, (total_results - 1) // page_size + 1)

            all_articles.extend(payload.get('articles', []))

            if page >= max_pages_for_day:  # all available pages fetched
                break

            # Throttle only when another page of the same day follows.
            time.sleep(delay)

        # Move to the next day
        current_date += timedelta(days=1)

    return {"articles": all_articles}

def convert_to_dataframe(news_data):
    """Turn a ``get_news``-style payload into a cleaned, time-sorted DataFrame.

    Args:
        news_data: Mapping with an ``'articles'`` key holding a list of
            article dicts (each expected to carry ``'source'`` and
            ``'publishedAt'``).

    Returns:
        A DataFrame with the ``source`` dict stringified, exact duplicate rows
        dropped, rows dated ``1970-01-01T00:00:00Z`` removed, sorted ascending
        by ``publishedAt`` with a fresh 0..n-1 index. An empty article list
        yields an empty DataFrame.
    """
    df = pd.DataFrame(news_data['articles'])

    # With no articles the frame has no columns at all, so the column lookups
    # below would raise KeyError; hand back the empty frame instead.
    if df.empty:
        return df

    # Convert 'source' column to string to avoid "unhashable type: 'dict'" error
    df['source'] = df['source'].astype(str)

    # Remove duplicates
    df = df.drop_duplicates()

    # Drop rows stamped with the Unix-epoch date. The original note associates
    # these with '[Removed]' / 'https://removed.com' placeholder entries; only
    # the date is actually checked here.
    removal_conditions = (df['publishedAt'] == '1970-01-01T00:00:00Z')
    df = df[~removal_conditions]

    # Sort by 'publishedAt' column in ascending order
    df = df.sort_values(by='publishedAt', ascending=True).reset_index(drop=True)

    return df


def get_month_range(year, month):
    """Return ``(first_day, last_day)`` of the given month as ``date`` objects."""
    # monthrange yields (weekday of the 1st, number of days); only the day
    # count is needed to locate the month's final date.
    _, days_in_month = calendar.monthrange(year, month)
    return date(year, month, 1), date(year, month, days_in_month)

In [31]:
# Ask which calendar year to harvest.
year = int(input("Enter the year (e.g. 2023): "))

# January through December: fetch, clean, and persist one CSV per month.
for month in range(1, 13):
    filename = f"C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_{year}_{month:02}.csv"

    start_date, end_date = get_month_range(year, month)
    news_data = get_news(api_key, start_date=start_date, end_date=end_date)
    df = convert_to_dataframe(news_data)

    df.to_csv(filename, index=False, encoding='utf-8-sig')
    print(f"Saved data for {year}-{month:02} to {filename}")

Saved data for 2023-01 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_01.csv
Saved data for 2023-02 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_02.csv
Saved data for 2023-03 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_03.csv
Saved data for 2023-04 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_04.csv
Saved data for 2023-05 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_05.csv
Saved data for 2023-06 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_06.csv
Saved data for 2023-07 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_07.csv
Saved data for 2023-08 to C:/Users/boyu57

KeyboardInterrupt: 

In [30]:
# Walk every (year, month) pair from 2018-01 through 2022-12 in order,
# writing one CSV per month.
for year, month in ((y, m) for y in range(2018, 2023) for m in range(1, 13)):
    filename = f"C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_{year}_{month:02}.csv"

    start_date, end_date = get_month_range(year, month)
    news_data = get_news(api_key, start_date=start_date, end_date=end_date)
    df = convert_to_dataframe(news_data)

    df.to_csv(filename, index=False, encoding='utf-8-sig')
    print(f"Saved data for {year}-{month:02} to {filename}")

Saved data for 2018-01 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_01.csv
Saved data for 2018-02 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_02.csv
Saved data for 2018-03 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_03.csv
Saved data for 2018-04 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_04.csv
Saved data for 2018-05 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_05.csv
Saved data for 2018-06 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_06.csv
Saved data for 2018-07 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_07.csv
Saved data for 2018-08 to C:/Users/boyu57