In [2]:
import sys

# Add the sibling ``util`` directory to the module search path. The path is
# relative, so it is resolved against the current working directory at import
# time. Presumably this is where the project-local ``kis_auth`` / ``kis_domstk``
# modules imported further down live -- TODO confirm.
sys.path.append(r'../util')

In [6]:
import kis_auth as ka
import kis_domstk as kb

import os
import time
import logging
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm

import duckdb

In [None]:
# --- Output locations -------------------------------------------------------
# Everything (log file and DuckDB database) is kept under ~/data/news, which
# is created on first run.
destdir = os.path.join(os.path.expanduser('~'), 'data', 'news')
os.makedirs(destdir, exist_ok=True)

# --- Logging ----------------------------------------------------------------
# Root logger writes INFO and above to a file next to the database.
log_file_path = os.path.join(destdir, 'news_processing.log')
logging.basicConfig(
    filename=log_file_path,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# --- DuckDB target ----------------------------------------------------------
db_path = os.path.join(destdir, 'news_database.duckdb')
table_name = 'news_titles'  # Name of the DuckDB table that receives the rows
logging.info(f"Using DuckDB database at: {db_path}, table: {table_name}")

# --- Date range (both endpoints inclusive) ----------------------------------
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 3, 23)

# Columns removed from every fetched chunk before it is stored: the
# ``iscd*`` / ``kor_isnm*`` fields with suffixes 2 through 10. These must match
# the column names in the fetched data exactly.
dropcols = (
    [f'iscd{suffix}' for suffix in range(2, 11)]
    + [f'kor_isnm{suffix}' for suffix in range(2, 11)]
)

# One 'YYYYMMDD' string per day in the range, most recent day first, so the
# newest dates are processed before older ones.
dates = [
    (end_date - timedelta(days=offset)).strftime('%Y%m%d')
    for offset in range((end_date - start_date).days + 1)
]

# ---------------------------------------------------------------------------
# Fetch loop
#
# For every date in ``dates`` the news-title endpoint is polled once per
# 10-second time slot, starting at 00:10:00 and stopping before 23:59:00.
# Each non-empty response is converted to a DataFrame, stripped of the
# columns listed in ``dropcols`` and appended to the DuckDB table
# ``table_name``.
#
# Error policy (unchanged from the original script):
#   * a failure while converting/inserting one chunk is logged and the loop
#     moves on to the next time slot (best effort);
#   * a failure while fetching or advancing the time aborts the current day
#     only;
#   * anything escaping the per-day handling is logged as critical, and the
#     DuckDB connection is always closed.
# ---------------------------------------------------------------------------
con = None
# Flipped to True once the target table is known to exist, so the
# CREATE TABLE IF NOT EXISTS statement is issued only once per run.
table_ready = False

try:
    con = duckdb.connect(database=db_path, read_only=False)
    logging.info(f"Connected to DuckDB database: {db_path}")

    for yyyymmdd_str in dates:
        # The API helper is called with 10-character, zero-padded strings.
        yyyymmdd_api = yyyymmdd_str.zfill(10)
        # The last six digits are interpreted below as HHMMSS, i.e. 00:10:00.
        hhmmss_api = "0000001000"
        total_rows_inserted_today = 0

        # Using tqdm as a context manager guarantees the bar is closed even
        # when the loop is left through an exception such as Ctrl-C.
        with tqdm(total=0, unit="chunk", dynamic_ncols=True,
                  desc=f"Processing {yyyymmdd_str}") as pbar:
            while True:
                try:
                    news_data = kb.get_news_titles(date_1=yyyymmdd_api, hour_1=hhmmss_api)

                    # Explicit None/length check: plain truthiness raises
                    # ValueError if the helper returns a pandas DataFrame.
                    if news_data is not None and len(news_data) > 0:
                        try:
                            news_chunk_df = pd.DataFrame(news_data)
                            chunk_rows = len(news_chunk_df)

                            # Drop whichever of the unwanted columns are present.
                            news_chunk_df = news_chunk_df.drop(columns=dropcols, errors="ignore")

                            # DuckDB resolves ``news_chunk_df`` in the SQL text
                            # to the DataFrame variable of the same name, so
                            # that variable must not be renamed.
                            if not table_ready:
                                # On a fresh database the table does not exist
                                # yet; create it empty with the chunk's schema.
                                con.execute(
                                    f"CREATE TABLE IF NOT EXISTS {table_name} AS "
                                    f"SELECT * FROM news_chunk_df LIMIT 0"
                                )
                                table_ready = True

                            con.execute(f"INSERT INTO {table_name} SELECT * FROM news_chunk_df")

                            total_rows_inserted_today += chunk_rows
                            logging.debug(f"Appended {chunk_rows} rows for {yyyymmdd_str} at {hhmmss_api[-6:]} to DuckDB table '{table_name}'.")

                        except Exception as e:
                            # Best effort: record the failure (with traceback)
                            # and continue with the next time slot.
                            logging.exception(f"Error processing or inserting chunk for {yyyymmdd_str} at {hhmmss_api[-6:]}: {e}")

                    # Advance the request time by one 10-second step.
                    current_hms = hhmmss_api[-6:]
                    try:
                        dt_obj = datetime.strptime(current_hms, "%H%M%S")
                    except ValueError:
                        logging.error(f"Could not parse time: {current_hms} for date {yyyymmdd_str}. Stopping processing for this day.")
                        break

                    dt_obj += timedelta(seconds=10)
                    next_hms = dt_obj.strftime("%H%M%S")

                    # Stop once the next slot would be 23:59:00 or later; the
                    # last request of a day is therefore made for 23:58:50.
                    if next_hms >= "235900":
                        break

                    hhmmss_api = next_hms.zfill(10)

                    # Pause one second between consecutive API requests.
                    time.sleep(1)

                    pbar.set_description(f'Date={yyyymmdd_str}, Time={dt_obj.strftime("%H:%M:%S")}, Rows Today={total_rows_inserted_today}')
                    pbar.update(1)

                except Exception as e:
                    # A fetch failure ends processing for this day only.
                    logging.exception(f"Error fetching data for {yyyymmdd_str} at API time {hhmmss_api}: {e}")
                    break

        logging.info(f"Finished processing {yyyymmdd_str}. Total rows inserted today: {total_rows_inserted_today}")

except Exception as e:
    logging.exception(f"A critical error occurred during processing: {e}")

finally:
    if con:
        con.close()
        logging.info("Closed DuckDB connection.")

# Wrap-up: record completion in the log file, then tell the user on stdout
# where the log and the DuckDB data can be found.
logging.info(f"News processing finished. Logs saved to: {log_file_path}")

print(
    f"Processing complete. Check log file: {log_file_path}",
    f"Data stored in DuckDB file: {db_path}, table: {table_name}",
    sep="\n",
)