# 월별 데이터 수집

In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time
import re
from tqdm import tqdm
import random
import os
import calendar
from datetime import timedelta, date

# # Set the maximum column width
# pd.set_option('display.max_colwidth', None)

# # Set the maximum number of displayed rows
# pd.set_option('display.max_rows', None)

# Put the pandas display limits back to their defaults.
pd.reset_option('display.max_colwidth')
pd.reset_option('display.max_rows')

# NOTE(review): a live credential is committed in this file. Prefer supplying it
# through the NEWSAPI_KEY environment variable and rotate the key below; the
# literal is kept only as a fallback so existing runs keep working unchanged.
api_key = os.environ.get("NEWSAPI_KEY", "64a1bdc693984439b9e3a23e58ea5162")

In [5]:
def get_news(api_key, start_date, end_date, query="virtual asset", language="en", page_size=100, max_pages=1000, delay=2):
    """Collect articles from the NewsAPI "everything" endpoint, one day at a time.

    For every day from start_date through end_date (inclusive) the endpoint is
    queried page by page, with "from" and "to" both set to that day, until the
    page count implied by the response's 'totalResults' (capped at max_pages)
    has been fetched.

    Args:
        api_key: Value sent as the "apiKey" request parameter.
        start_date: First day to query; must support ``<=`` comparison and
            ``+ timedelta`` (callers in this file pass ``datetime.date``).
        end_date: Last day to query (inclusive).
        query: Search phrase sent as "q".
        language: Language code sent as "language".
        page_size: Number of results requested per page.
        max_pages: Upper bound on pages requested per day.
        delay: Seconds to sleep between consecutive pages of the same day.

    Returns:
        ``{"articles": [...]}`` with the article dicts of all days concatenated
        in request order.

    Note:
        A non-200 response is reported with ``print`` and ends paging for that
        day. Previously the "all pages fetched" check was only reached on
        success, so a failing first or last page made the loop keep issuing
        requests up to max_pages.
    """
    base_url = "https://newsapi.org/v2/everything"
    all_articles = []

    current_date = start_date
    while current_date <= end_date:
        pages_for_day = None  # unknown until the first successful response of the day
        for page in range(1, max_pages + 1):
            parameters = {
                "q": query,
                "language": language,
                "pageSize": page_size,
                "page": page,
                "apiKey": api_key,
                "from": current_date,
                "to": current_date,
                "sortBy": "publishedAt"
            }
            response = requests.get(base_url, params=parameters)
            if response.status_code != 200:
                print(f"Error on {current_date} page {page}: {response.status_code}")
                # Stop paging this day instead of marching on through max_pages.
                break

            payload = response.json()  # parse the body once and reuse it
            if pages_for_day is None:
                total_results = payload.get('totalResults', 0)
                # Ceiling division; evaluates to 0 when there are no results.
                pages_for_day = min(max_pages, (total_results - 1) // page_size + 1)

            all_articles.extend(payload.get('articles', []))

            if page >= pages_for_day:  # everything available for this day is fetched
                break

            # Only reached when another page of the same day will be requested.
            time.sleep(delay)

        # Move to the next day
        current_date += timedelta(days=1)

    return {"articles": all_articles}

def convert_to_dataframe(news_data):
    """Turn get_news() output into a de-duplicated, date-sorted DataFrame.

    Args:
        news_data: Mapping with an 'articles' key holding a list of article
            dicts (the shape returned by get_news).

    Returns:
        A DataFrame in which the 'source' values are stringified, exact
        duplicate rows are dropped, rows whose 'publishedAt' equals
        '1970-01-01T00:00:00Z' are removed, and the remaining rows are sorted
        by 'publishedAt' ascending with a fresh 0..n-1 index. An empty article
        list yields an empty DataFrame instead of raising KeyError.
    """
    df = pd.DataFrame(news_data['articles'])

    # With no articles the frame has no columns, so the lookups below would fail.
    if df.empty:
        return df

    # Convert 'source' column to string to avoid "unhashable type: 'dict'" error
    df['source'] = df['source'].astype(str)

    # Remove duplicates
    df = df.drop_duplicates()

    # Drop rows carrying the epoch timestamp. The earlier note here also listed
    # '[Removed]' / 'https://removed.com' entries, but only the date is checked.
    removal_conditions = (df['publishedAt'] == '1970-01-01T00:00:00Z')
    df = df[~removal_conditions]

    # Sort by 'publishedAt' column in ascending order (string comparison)
    df = df.sort_values(by='publishedAt', ascending=True).reset_index(drop=True)

    return df



def get_month_range(year, month):
    """Return (first_day, last_day) of the given month as ``date`` objects."""
    # monthrange yields (weekday of the 1st, number of days); only the latter is needed.
    _, days_in_month = calendar.monthrange(year, month)
    first_day = date(year, month, 1)
    return first_day, first_day.replace(day=days_in_month)

NameError: name 'query_str' is not defined

In [3]:
# Fetch one month of articles; only the month is read interactively.
# year = int(input("Enter the year (e.g. 2023): "))
year = 2023  # year is pinned; the prompt above is disabled
month = int(input("Enter the month (1-12): "))  # int() raises ValueError on non-numeric input
start_date, end_date = get_month_range(year, month)  # first and last day of that month
news_data = get_news(api_key, start_date, end_date)  # dict with an 'articles' list

In [4]:
# Display the raw result of get_news (the notebook echoes the value below).
news_data

{'articles': [{'source': {'id': None, 'name': 'Tech Xplore'},
   'author': 'Lucie LEQUIER with Thomas URBAIN in New York',
   'title': 'Crypto industry grapples with FTX fallout as trial looms',
   'description': 'The collapse of cryptocurrency platform FTX, whose disgraced former boss goes on trial this week, sparked shock waves worldwide, with regulators still seeking to get to grips with the sector.',
   'url': 'https://techxplore.com/news/2023-10-crypto-industry-grapples-ftx-fallout.html',
   'urlToImage': 'https://scx2.b-cdn.net/gfx/news/2023/as-former-ftx-chief-sa.jpg',
   'publishedAt': '2023-10-01T19:53:11Z',
   'content': 'The collapse of cryptocurrency platform FTX, whose disgraced former boss goes on trial this week, sparked shock waves worldwide, with regulators still seeking to get to grips with the sector.\r\nSam Ba… [+4011 chars]'},
  {'source': {'id': None, 'name': 'The Conversation Africa'},
   'author': 'John Hawkins, Senior Lecturer, Canberra School of Politics, Econ

In [6]:
# Clean the fetched articles into a DataFrame, report the row count, then display it.
df = convert_to_dataframe(news_data)
print(len(df))
df

502


Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,"{'id': None, 'name': 'Screen Rant'}",Stephen M. Colbert,How Hollywood Finally Providing Streaming Data...,The new agreement between Hollywood studios an...,https://screenrant.com/movie-streaming-data-tr...,https://static1.srcdn.com/wordpress/wp-content...,2023-10-01T02:00:27Z,Summary\r\n<ul><li> The new WGA basic agreemen...
1,"{'id': 'the-times-of-india', 'name': 'The Time...",ET CONTRIBUTORS,The future of cryptocurrencies and its legalis...,A report from Statista shows that the Indian c...,https://economictimes.indiatimes.com/markets/c...,"https://img.etimg.com/thumb/msid-104081414,wid...",2023-10-01T05:50:55Z,We live in a world where for as long as memory...
2,"{'id': None, 'name': 'AllAfrica - Top Africa N...",,Agencies Call for Specialised Courts for Drug ...,[Leadership] The National Drug Law Enforcement...,https://allafrica.com/stories/202310010048.html,https://cdn06.allafrica.com/download/pic/main/...,2023-10-01T07:29:09Z,The National Drug Law Enforcement Agencies (ND...
3,"{'id': None, 'name': 'Cgpersia.com'}",Diptra,Autodesk 3ds Max Bundle 1 Oct 2023,Autodesk 3ds Max Bundle 1 Oct 2023 Title: Auto...,https://cgpersia.com/2023/10/autodesk-3ds-max-...,,2023-10-01T08:12:18Z,Autodesk 3ds Max Bundle 1 Oct 2023\r\nInfo:\r\...
4,"{'id': None, 'name': 'International Business T...",Lucie LEQUIER with Thomas URBAIN in New York,Crypto Industry Grapples With FTX Fallout As T...,"The collapse of cryptocurrency platform FTX, w...",https://www.ibtimes.com/crypto-industry-grappl...,https://d.ibtimes.com/en/full/4489759/former-f...,2023-10-01T12:27:29Z,"The collapse of cryptocurrency platform FTX, w..."
...,...,...,...,...,...,...,...,...
497,"{'id': None, 'name': 'Marketscreener.com'}",,PROG Holdings Beats Third Quarter 2023 Expecta...,(marketscreener.com) \n\nConsolidated revenues...,https://www.marketscreener.com/quote/stock/PRO...,https://www.marketscreener.com/images/twitter_...,2023-10-25T11:02:02Z,Consolidated revenues of $582.9 million\r\nEar...
498,"{'id': None, 'name': 'Unity.com'}",,Water Fountain - [Asset for Zibra Liquid] by Z...,Water fountain is an environmental 3D asset cr...,https://assetstore.unity.com/packages/3d/envir...,https://assetstorev1-prd-cdn.unity3d.com/key-i...,2023-10-25T11:15:18Z,"Over 11,000 five-star assets\r\nRated by 85,00..."
499,"{'id': None, 'name': 'Unity.com'}",,Liquid Elemental - [Asset for Zibra Liquid] by...,Liquid Elemental is an interactive 3D characte...,https://assetstore.unity.com/packages/3d/chara...,https://assetstorev1-prd-cdn.unity3d.com/key-i...,2023-10-25T11:15:26Z,"Over 11,000 five-star assets\r\nRated by 85,00..."
500,"{'id': None, 'name': 'GlobeNewswire'}",Shibarium,SHIB Partners with RSTLSS to Support UGC for G...,RSTLSS will leverage Shibarium blockchain’s te...,https://www.globenewswire.com/news-release/202...,https://ml.globenewswire.com/Resource/Download...,2023-10-25T11:50:00Z,"Los Angeles, CA, Oct. 25, 2023 (GLOBE NEWSWIRE..."


In [28]:
# Write the month's DataFrame to a CSV named by year and zero-padded month.
# NOTE(review): the target is an absolute, machine-specific path; to_csv does not
# create missing directories, so the folder must already exist.
filename = f"C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_{year}_{month:02}.csv"
# utf-8-sig prepends a BOM; index=False leaves the row index out of the file.
df.to_csv(filename, encoding='utf-8-sig', index=False)

# 연도별 데이터 수집

In [29]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time
import re
from tqdm import tqdm
import random
import os
import calendar
from datetime import timedelta, date

# # Set the maximum column width
# pd.set_option('display.max_colwidth', None)

# # Set the maximum number of displayed rows
# pd.set_option('display.max_rows', None)

# Put the pandas display limits back to their defaults.
pd.reset_option('display.max_colwidth')
pd.reset_option('display.max_rows')

# NOTE(review): a live credential is committed in this file. Prefer supplying it
# through the NEWSAPI_KEY environment variable and rotate the key below; the
# literal is kept only as a fallback so existing runs keep working unchanged.
api_key = os.environ.get("NEWSAPI_KEY", "64a1bdc693984439b9e3a23e58ea5162")

def get_news(api_key, start_date, end_date, query="virtual asset", language="en", page_size=100, max_pages=1000, delay=2):
    """Collect articles from the NewsAPI "everything" endpoint, one day at a time.

    For every day from start_date through end_date (inclusive) the endpoint is
    queried page by page, with "from" and "to" both set to that day, until the
    page count implied by the response's 'totalResults' (capped at max_pages)
    has been fetched.

    Args:
        api_key: Value sent as the "apiKey" request parameter.
        start_date: First day to query; must support ``<=`` comparison and
            ``+ timedelta`` (callers in this file pass ``datetime.date``).
        end_date: Last day to query (inclusive).
        query: Search phrase sent as "q".
        language: Language code sent as "language".
        page_size: Number of results requested per page.
        max_pages: Upper bound on pages requested per day.
        delay: Seconds to sleep between consecutive pages of the same day.

    Returns:
        ``{"articles": [...]}`` with the article dicts of all days concatenated
        in request order.

    Note:
        A non-200 response is reported with ``print`` and ends paging for that
        day. Previously the "all pages fetched" check was only reached on
        success, so a failing first or last page made the loop keep issuing
        requests up to max_pages.
    """
    base_url = "https://newsapi.org/v2/everything"
    all_articles = []

    current_date = start_date
    while current_date <= end_date:
        pages_for_day = None  # unknown until the first successful response of the day
        for page in range(1, max_pages + 1):
            parameters = {
                "q": query,
                "language": language,
                "pageSize": page_size,
                "page": page,
                "apiKey": api_key,
                "from": current_date,
                "to": current_date,
                "sortBy": "publishedAt"
            }
            response = requests.get(base_url, params=parameters)
            if response.status_code != 200:
                print(f"Error on {current_date} page {page}: {response.status_code}")
                # Stop paging this day instead of marching on through max_pages.
                break

            payload = response.json()  # parse the body once and reuse it
            if pages_for_day is None:
                total_results = payload.get('totalResults', 0)
                # Ceiling division; evaluates to 0 when there are no results.
                pages_for_day = min(max_pages, (total_results - 1) // page_size + 1)

            all_articles.extend(payload.get('articles', []))

            if page >= pages_for_day:  # everything available for this day is fetched
                break

            # Only reached when another page of the same day will be requested.
            time.sleep(delay)

        # Move to the next day
        current_date += timedelta(days=1)

    return {"articles": all_articles}

def convert_to_dataframe(news_data):
    """Turn get_news() output into a de-duplicated, date-sorted DataFrame.

    Args:
        news_data: Mapping with an 'articles' key holding a list of article
            dicts (the shape returned by get_news).

    Returns:
        A DataFrame in which the 'source' values are stringified, exact
        duplicate rows are dropped, rows whose 'publishedAt' equals
        '1970-01-01T00:00:00Z' are removed, and the remaining rows are sorted
        by 'publishedAt' ascending with a fresh 0..n-1 index. An empty article
        list yields an empty DataFrame instead of raising KeyError.
    """
    df = pd.DataFrame(news_data['articles'])

    # With no articles the frame has no columns, so the lookups below would fail.
    if df.empty:
        return df

    # Convert 'source' column to string to avoid "unhashable type: 'dict'" error
    df['source'] = df['source'].astype(str)

    # Remove duplicates
    df = df.drop_duplicates()

    # Drop rows carrying the epoch timestamp. The earlier note here also listed
    # '[Removed]' / 'https://removed.com' entries, but only the date is checked.
    removal_conditions = (df['publishedAt'] == '1970-01-01T00:00:00Z')
    df = df[~removal_conditions]

    # Sort by 'publishedAt' column in ascending order (string comparison)
    df = df.sort_values(by='publishedAt', ascending=True).reset_index(drop=True)

    return df


def get_month_range(year, month):
    """Return (first_day, last_day) of the given month as ``date`` objects."""
    # monthrange yields (weekday of the 1st, number of days); only the latter is needed.
    _, days_in_month = calendar.monthrange(year, month)
    first_day = date(year, month, 1)
    return first_day, first_day.replace(day=days_in_month)

In [31]:
# Get input from the user for the year
year = int(input("Enter the year (e.g. 2023): "))  # int() raises ValueError on non-numeric input

# Loop through each month for the given year (all 12, January through December)
for month in range(1, 13):
    start_date, end_date = get_month_range(year, month)
    news_data = get_news(api_key, start_date, end_date)

    # Clean, de-duplicate and sort this month's articles.
    df = convert_to_dataframe(news_data)

    # One CSV per month, named by year and zero-padded month.
    # NOTE(review): absolute, machine-specific path; the directory must already exist.
    filename = f"C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_{year}_{month:02}.csv"
    df.to_csv(filename, encoding='utf-8-sig', index=False)
    print(f"Saved data for {year}-{month:02} to {filename}")

Saved data for 2023-01 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_01.csv
Saved data for 2023-02 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_02.csv
Saved data for 2023-03 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_03.csv
Saved data for 2023-04 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_04.csv
Saved data for 2023-05 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_05.csv
Saved data for 2023-06 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_06.csv
Saved data for 2023-07 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2023_07.csv
Saved data for 2023-08 to C:/Users/boyu57

KeyboardInterrupt: 

In [30]:
# Loop through every month of the years 2018 through 2022 (range end is exclusive)
for year in range(2018, 2023):
    for month in range(1, 13):
        start_date, end_date = get_month_range(year, month)
        news_data = get_news(api_key, start_date, end_date)

        # Clean, de-duplicate and sort this month's articles.
        df = convert_to_dataframe(news_data)

        # One CSV per (year, month), named by year and zero-padded month.
        # NOTE(review): absolute, machine-specific path; the directory must already exist.
        filename = f"C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_{year}_{month:02}.csv"
        df.to_csv(filename, encoding='utf-8-sig', index=False)
        print(f"Saved data for {year}-{month:02} to {filename}")

Saved data for 2018-01 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_01.csv
Saved data for 2018-02 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_02.csv
Saved data for 2018-03 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_03.csv
Saved data for 2018-04 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_04.csv
Saved data for 2018-05 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_05.csv
Saved data for 2018-06 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_06.csv
Saved data for 2018-07 to C:/Users/boyu571/boyu571_Github/01_Kakaobank_SKKU_Research_23/data/foreign_news_data/virtual_asset_2018_07.csv
Saved data for 2018-08 to C:/Users/boyu57