In [17]:
from datetime import datetime
import pandas as pd
from pathlib import Path

# Project directory; the data file paths used below are built relative to it.
root = Path('/Users/hwang-yejin/Desktop/Financial Time Series Forecasting with Deep Learning Models and Social Media Sentiment')

# Calendar date at (or after) which each train/test window begins.
anchor = pd.Timestamp('2025-02-01')

# Number of rows in the training slice and in the prediction (test) slice.
train_len = 96
pred_len = 5

def window_from_anchor(csv_path, anchor_date=None, n_train=None, n_pred=None):
    """Report the train/test date window that starts at the anchor date.

    The CSV is loaded, its column names are lower-cased, and a date column
    is located ('date', or the first of 'datetime', 'timestamp', 'created',
    'day' that exists). Rows are sorted by date and the window begins at the
    first row dated on or after the anchor. When fewer than
    ``n_train + n_pred`` rows remain from that point, the window is shifted
    back so that it ends on the last row of the file.

    Args:
        csv_path: Path or file-like object accepted by ``pd.read_csv``.
        anchor_date: Optional anchor (anything ``pd.Timestamp`` accepts).
            Defaults to the module-level ``anchor``.
        n_train: Optional number of training rows. Defaults to the
            module-level ``train_len``.
        n_pred: Optional number of test rows. Defaults to the module-level
            ``pred_len``.

    Returns:
        A 7-tuple ``(csv_path, train_start, train_end, train_rows,
        test_start, test_end, test_rows)``. If no date column can be found
        the tuple is ``(csv_path, 'NO_DATE_COL', None, None, None, None,
        None)``; if no row is dated on or after the anchor, all six trailing
        fields are ``None``. If the file holds fewer than
        ``n_train + n_pred`` usable rows, the slices are shorter than
        requested (the test slice may be empty).
    """
    # Fall back to the module-level settings only when no override was given,
    # so the function can also be used (and tested) without those globals.
    anchor_ts = pd.Timestamp(anchor if anchor_date is None else anchor_date)
    if anchor_ts.tzinfo is not None:
        # The date column is normalised to naive UTC below; do the same for
        # the anchor so the comparison does not mix naive and aware values.
        anchor_ts = anchor_ts.tz_convert('UTC').tz_localize(None)
    n_train = train_len if n_train is None else n_train
    n_pred = pred_len if n_pred is None else n_pred

    df = pd.read_csv(csv_path)
    # 1) Normalise column names to stripped lower case.
    df.columns = [str(c).strip().lower() for c in df.columns]

    # 2) Find the date column; if 'date' is absent, try common alternatives.
    date_col = 'date'
    candidates = (date_col, 'datetime', 'timestamp', 'created', 'day')
    source_col = next((c for c in candidates if c in df.columns), None)
    # Last line of defence: report the problem instead of raising.
    if source_col is None:
        return csv_path, 'NO_DATE_COL', None, None, None, None, None

    parsed = pd.to_datetime(df[source_col], errors='coerce', utc=True).dt.tz_localize(None)
    if source_col != date_col:
        # Values taken from a substitute column are truncated to whole days.
        parsed = parsed.dt.floor('D')
    df[date_col] = parsed
    # Unparseable dates became NaT above; drop them before sorting.
    df = df.dropna(subset=[date_col]).sort_values(date_col).reset_index(drop=True)

    # First row dated on or after the anchor.
    on_or_after = df.index[df[date_col] >= anchor_ts]
    if len(on_or_after) == 0:
        return csv_path, None, None, None, None, None, None
    start_idx = int(on_or_after[0])

    total_needed = n_train + n_pred
    end_idx = start_idx + total_needed
    if end_idx > len(df):
        # Not enough rows after the anchor: shift the window back so it ends
        # on the last available row.
        end_idx = len(df)
        start_idx = max(0, end_idx - total_needed)

    win = df.iloc[start_idx:end_idx]
    train = win.iloc[:n_train]
    test = win.iloc[n_train:n_train + n_pred]
    return (
        csv_path,
        train[date_col].min().date(), train[date_col].max().date(), len(train),
        test[date_col].min().date(), test[date_col].max().date(), len(test)
    )

# CSV files whose train/test windows are compared against each other.
paths = [
    root/'data/TSLA_close.csv',
    root/'data/interim/TSLA_price_full.csv',
    root/'data/processed/tsla_price_sentiment_spike_merged_20220721_20250915.csv'
]

# One summary tuple per file, in the same order as `paths`.
rows = [window_from_anchor(csv_file) for csv_file in paths]

# Tabulate the per-file windows so differences between files are easy to spot.
df_out = pd.DataFrame(
    rows,
    columns=[
        'file',
        'train_start', 'train_end', 'train_rows',
        'test_start', 'test_end', 'test_rows',
    ],
)
print(df_out.to_string(index=False))


# Bare expression: the notebook renders the frame as the cell's output.
df_out

                                                                                                                                                                                    file train_start  train_end  train_rows test_start   test_end  test_rows
                                                   /Users/hwang-yejin/Desktop/Financial Time Series Forecasting with Deep Learning Models and Social Media Sentiment/data/TSLA_close.csv  2025-02-03 2025-06-20          96 2025-06-23 2025-06-27          5
                                      /Users/hwang-yejin/Desktop/Financial Time Series Forecasting with Deep Learning Models and Social Media Sentiment/data/interim/TSLA_price_full.csv  2025-02-03 2025-06-20          96 2025-06-23 2025-06-27          5
/Users/hwang-yejin/Desktop/Financial Time Series Forecasting with Deep Learning Models and Social Media Sentiment/data/processed/tsla_price_sentiment_spike_merged_20220721_20250915.csv  2025-02-03 2025-06-20          96 2025-06-23 2025-06-27          5

  df[date_col] = pd.to_datetime(df[date_col], errors='coerce', utc=True).dt.tz_localize(None)


Unnamed: 0,file,train_start,train_end,train_rows,test_start,test_end,test_rows
0,/Users/hwang-yejin/Desktop/Financial Time Seri...,2025-02-03,2025-06-20,96,2025-06-23,2025-06-27,5
1,/Users/hwang-yejin/Desktop/Financial Time Seri...,2025-02-03,2025-06-20,96,2025-06-23,2025-06-27,5
2,/Users/hwang-yejin/Desktop/Financial Time Seri...,2025-02-03,2025-06-20,96,2025-06-23,2025-06-27,5
