# 01. Data Loader & QA

---

In [45]:
# Library Imports
import pandas as pd
import numpy as np
import os
from pathlib import Path

# 시각화 설정
%matplotlib inline
from IPython.display import display

# 경로 설정
PROJECT_ROOT = Path('.').resolve()
DATA_DIR = PROJECT_ROOT / 'Data_set'

In [46]:
# 데이터셋 경로 확인
dataset_path = DATA_DIR / 'stock_details_5_years.csv'

In [47]:
# 데이터 로드
df_raw = pd.read_csv(dataset_path)

print(f"원본 데이터 Shape: {df_raw.shape}")
df_raw.head()

원본 데이터 Shape: (602962, 9)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 00:00:00-05:00,43.829761,43.863354,42.639594,43.083508,167080000,0.0,0.0,AAPL
1,2018-11-29 00:00:00-05:00,104.769074,105.519257,103.534595,104.636131,28123200,0.0,0.0,MSFT
2,2018-11-29 00:00:00-05:00,54.176498,55.0075,54.099998,54.729,31004000,0.0,0.0,GOOGL
3,2018-11-29 00:00:00-05:00,83.749496,84.499496,82.616501,83.678497,132264000,0.0,0.0,AMZN
4,2018-11-29 00:00:00-05:00,39.692784,40.064904,38.735195,39.037853,54917200,0.04,0.0,NVDA


---

## 3. QA

In [48]:
# 기본 정보 확인
print("=" * 60)
print("데이터 기본 정보")
print("=" * 60)
print(f"행 수: {len(df_raw):,}")
print(f"열 수: {len(df_raw.columns)}")
print(f"컬럼: {list(df_raw.columns)}")
print(f"\n데이터 타입:")
print(df_raw.dtypes)

데이터 기본 정보
행 수: 602,962
열 수: 9
컬럼: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits', 'Company']

데이터 타입:
Date             object
Open            float64
High            float64
Low             float64
Close           float64
Volume            int64
Dividends       float64
Stock Splits    float64
Company          object
dtype: object


In [49]:
# 결측치 확인
print("=" * 60)
print("결측치 분석")
print("=" * 60)
null_counts = df_raw.isnull().sum()
null_percent = (null_counts / len(df_raw)) * 100

null_df = pd.DataFrame({
    '결측치 수': null_counts,
    '결측치 비율(%)': null_percent.round(2)
})
print(null_df)

결측치 분석
              결측치 수  결측치 비율(%)
Date              0        0.0
Open              0        0.0
High              0        0.0
Low               0        0.0
Close             0        0.0
Volume            0        0.0
Dividends         0        0.0
Stock Splits      0        0.0
Company           0        0.0


In [50]:
# 중복값 확인
print("=" * 60)
print("중복값 분석")
print("=" * 60)
duplicate_rows = df_raw.duplicated().sum()
print(f"중복 행 수: {duplicate_rows:,}")
print(f"중복 비율: {(duplicate_rows / len(df_raw)) * 100:.2f}%")

중복값 분석
중복 행 수: 0
중복 비율: 0.00%


In [51]:
# 기업(Company) 목록 확인
print("=" * 60)
print("기업 정보")
print("=" * 60)
companies = df_raw['Company'].unique()
print(f"총 기업 수: {len(companies)}")
print(f"기업 목록 (처음 20개): {list(companies[:20])}")

기업 정보
총 기업 수: 491
기업 목록 (처음 20개): ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'META', 'TSLA', 'LLY', 'V', 'TSM', 'UNH', 'AVGO', 'NVO', 'JPM', 'WMT', 'XOM', 'MA', 'JNJ', 'PG', 'ORCL']


---

## 4. 날짜 전처리

일봉 데이터 분석을 위해 시간 정보를 제거하고 YYYY-MM-DD 형식으로 변환함

### 전처리 이유
1. **데이터의 성격**: 일봉 데이터는 하루의 시세를 요약한 것이므로 시간 정보는 불필요한 노이즈임
2. **그룹화 정확성**: 시간 정보가 포함되면 같은 날짜도 다른 그룹으로 분류될 수 있음
3. **시각화 최적화**: 시간 정보 제거 시 X축 표현이 깔끔해짐

In [52]:
df = df_raw.copy()

# 날짜 변환 (시간 정보 제거)
df['Date'] = pd.to_datetime(df['Date'], utc=True).dt.date
df['Date'] = pd.to_datetime(df['Date'])

# 정렬 (Company, Date 기준)
df = df.sort_values(by=['Company', 'Date']).reset_index(drop=True)

print(f"날짜 범위: {df['Date'].min()} ~ {df['Date'].max()}")
df.head()

날짜 범위: 2018-11-29 00:00:00 ~ 2023-11-29 00:00:00


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29,68.673458,69.589358,68.673458,69.001251,2625800,0.0,0.0,A
1,2018-11-30,69.059076,70.04247,68.779483,69.753235,2279500,0.0,0.0,A
2,2018-12-03,70.698073,72.105671,70.563099,71.989975,4265200,0.0,0.0,A
3,2018-12-04,72.018907,72.414195,70.023203,70.293159,4263800,0.0,0.0,A
4,2018-12-06,68.644532,69.348335,67.391195,69.329056,3505900,0.0,0.0,A


---

## 5. 섹터 정보 매핑

기업별 섹터 정보를 매핑하여 섹터 분석이 가능하도록 함

In [53]:
# 섹터 매핑 정의
SECTOR_MAP = {
    'AAPL': 'Technology', 'MSFT': 'Technology', 'NVDA': 'Technology', 
    'TSM': 'Technology', 'AVGO': 'Technology', 'ORCL': 'Technology',
    'GOOGL': 'Communication', 'META': 'Communication',
    'AMZN': 'Consumer Cyclical', 'TSLA': 'Consumer Cyclical',
    'LLY': 'Healthcare', 'UNH': 'Healthcare', 'NVO': 'Healthcare', 'JNJ': 'Healthcare',
    'JPM': 'Financial', 'V': 'Financial', 'MA': 'Financial',
    'XOM': 'Energy',
    'WMT': 'Consumer Defensive', 'PG': 'Consumer Defensive'
}

df['Sector'] = df['Company'].map(SECTOR_MAP).fillna('Others')

# 섹터별 기업 수 확인
sector_counts = df.groupby('Sector')['Company'].nunique().sort_values(ascending=False)
print("섹터별 기업 수:")
print(sector_counts)

섹터별 기업 수:
Sector
Others                471
Technology              6
Healthcare              4
Financial               3
Communication           2
Consumer Cyclical       2
Consumer Defensive      2
Energy                  1
Name: Company, dtype: int64


---

## 6. 파생 지표 생성

투자 의사결정에 유효한 기술적 파생 변수를 생성함

### 6.1 일간 수익률 (Daily Return)

전일 종가 대비 당일 종가의 변화율을 계산함

In [54]:
df['Daily_Return'] = df.groupby('Company')['Close'].pct_change()

print("일간 수익률 통계:")
print(df['Daily_Return'].describe())

일간 수익률 통계:
count    602471.000000
mean          0.000742
std           0.024239
min          -0.529074
25%          -0.009768
50%           0.000758
75%           0.011137
max           1.205494
Name: Daily_Return, dtype: float64


### 6.2 누적 수익률

첫 거래일 대비 누적 수익률을 계산함

In [55]:
df['Cum_Return'] = df.groupby('Company')['Close'].transform(
    lambda x: (x / x.iloc[0]) - 1
)

print("누적 수익률 통계:")
print(df['Cum_Return'].describe())

누적 수익률 통계:
count    602962.000000
mean          0.567193
std           1.129756
min          -0.900908
25%           0.038039
50%           0.310041
75%           0.766310
max          30.406603
Name: Cum_Return, dtype: float64


### 6.3 이동평균선

5일, 20일, 60일 이동평균을 계산하여 추세 분석에 활용함

In [56]:
for window in [5, 20, 60]:
    df[f'MA_{window}'] = df.groupby('Company')['Close'].transform(
        lambda x: x.rolling(window).mean()
    )

print("이동평균 컬럼 생성 완료")
df[['Company', 'Date', 'Close', 'MA_5', 'MA_20', 'MA_60']].tail()

이동평균 컬럼 생성 완료


Unnamed: 0,Company,Date,Close,MA_5,MA_20,MA_60
602957,ZTS,2023-11-22,179.350006,177.095999,167.214238,173.920453
602958,ZTS,2023-11-24,180.210007,177.830002,168.340116,173.719958
602959,ZTS,2023-11-27,178.789993,178.628,169.49675,173.532209
602960,ZTS,2023-11-28,176.970001,178.810001,170.514,173.280706
602961,ZTS,2023-11-29,176.460007,178.356003,171.487,173.055453


### 6.4 변동성

20일간 일간 수익률의 표준편차로 리스크 측정

In [57]:
df['Volatility_20d'] = df.groupby('Company')['Daily_Return'].transform(
    lambda x: x.rolling(20).std()
)

print("변동성 통계:")
print(df['Volatility_20d'].describe())

변동성 통계:
count    593142.000000
mean          0.020378
std           0.013289
min           0.001131
25%           0.012450
50%           0.017062
75%           0.023702
max           0.319055
Name: Volatility_20d, dtype: float64


### 6.5 최대 낙폭 (MDD)

고점 대비 최대 하락률을 계산하여 리스크 평가

In [58]:
# 누적 최대값
df['Cum_Max'] = df.groupby('Company')['Close'].transform(
    lambda x: x.expanding().max()
)

# 현재 낙폭
df['Drawdown'] = (df['Close'] - df['Cum_Max']) / df['Cum_Max']

# MDD (기간 내 최저 낙폭)
df['MDD'] = df.groupby('Company')['Drawdown'].transform(
    lambda x: x.expanding().min()
)

print("MDD 통계 (음수여야 정상):")
print(df['MDD'].describe())

MDD 통계 (음수여야 정상):
count    602962.000000
mean         -0.396125
std           0.184992
min          -0.908979
25%          -0.521135
50%          -0.385543
75%          -0.255640
max           0.000000
Name: MDD, dtype: float64


### 6.6 거래량 이상치 분석

20일 평균 대비 거래량 비율 및 Z-Score를 계산하여 이상 거래량 탐지

In [59]:
# 20일 거래량 이동평균
df['Vol_MA_20'] = df.groupby('Company')['Volume'].transform(
    lambda x: x.rolling(20).mean()
)

# 거래량 비율 (평균 대비 배수)
df['Vol_Ratio'] = df['Volume'] / df['Vol_MA_20']

# Z-Score (표준편차 대비 이탈도, 3 이상이면 통계적 이상치)
df['Vol_Std_20'] = df.groupby('Company')['Volume'].transform(
    lambda x: x.rolling(20).std()
)
df['Vol_Z_Score'] = (df['Volume'] - df['Vol_MA_20']) / df['Vol_Std_20']

print("거래량 Z-Score 통계:")
print(df['Vol_Z_Score'].describe())

거래량 Z-Score 통계:
count    593633.000000
mean         -0.003192
std           1.045007
min          -3.298113
25%          -0.707117
50%          -0.241870
75%           0.471838
max           4.248430
Name: Vol_Z_Score, dtype: float64


### 6.7 갭 분석

전일 종가 대비 당일 시가의 차이로 장 시작 전 뉴스/이슈 반영 정도를 파악함

In [60]:
# 전일 종가
df['Prev_Close'] = df.groupby('Company')['Close'].shift(1)

# 갭 (절대값)
df['Gap'] = df['Open'] - df['Prev_Close']

# 갭 비율 (%)
df['Gap_Pct'] = (df['Open'] / df['Prev_Close']) - 1

print("갭 비율 통계:")
print(df['Gap_Pct'].describe())

갭 비율 통계:
count    602471.000000
mean          0.000492
std           0.015216
min          -0.476407
25%          -0.004600
50%           0.000567
75%           0.005690
max           0.625000
Name: Gap_Pct, dtype: float64


In [61]:
df_daily = df.copy()

print(f"Daily Master Shape: {df_daily.shape}")
print(f"\n컬럼 목록:")
print(list(df_daily.columns))

df_daily.sample(5)

Daily Master Shape: (602962, 26)

컬럼 목록:
['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits', 'Company', 'Sector', 'Daily_Return', 'Cum_Return', 'MA_5', 'MA_20', 'MA_60', 'Volatility_20d', 'Cum_Max', 'Drawdown', 'MDD', 'Vol_MA_20', 'Vol_Ratio', 'Vol_Std_20', 'Vol_Z_Score', 'Prev_Close', 'Gap', 'Gap_Pct']


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company,Sector,...,Cum_Max,Drawdown,MDD,Vol_MA_20,Vol_Ratio,Vol_Std_20,Vol_Z_Score,Prev_Close,Gap,Gap_Pct
177905,2018-12-04,23.913695,23.920917,23.343292,23.408272,284200,0.0,0.0,E,Others,...,23.97867,-0.023788,-0.023788,,,,,23.97867,-0.06497547,-0.00270972
246162,2022-03-07,104.53949,104.739065,101.441318,103.247002,3009800,0.0,0.0,GRMN,Others,...,168.021835,-0.385514,-0.385514,1605215.0,1.875014,876802.3,1.60194,105.0812,-0.5417092,-0.005155148
214828,2020-09-04,15.068682,15.549802,14.616428,15.415089,21907000,0.0,0.0,FCX,Others,...,15.655651,-0.015366,-0.620855,16367175.0,1.338472,4787650.0,1.157107,14.972458,0.0962237,0.006426714
157237,2021-06-28,106.0,107.190002,105.459999,106.440002,1452700,0.0,0.0,DDOG,Others,...,117.849998,-0.096818,-0.420916,2684315.0,0.541181,1182787.0,-1.041283,106.059998,-0.05999756,-0.0005656945
294149,2021-07-21,4.156082,4.230966,4.141105,4.223478,26187942,0.0,0.0,ITUB,Others,...,7.02465,-0.398763,-0.620202,45817120.55,0.571575,11572560.0,-1.696183,4.156081,6.002931e-07,1.444373e-07


---

## 8. Sector Summary Dataset 생성

섹터별 일간 집계 데이터를 생성하여 섹터 분석에 활용함

In [62]:
df_sector = df_daily.groupby(['Date', 'Sector']).agg({
    'Daily_Return': 'mean',
    'Close': 'mean',
    'Vol_Ratio': 'mean',
    'Volatility_20d': 'mean',
    'MDD': 'mean',
    'Company': 'count'
}).reset_index()

df_sector = df_sector.rename(columns={
    'Daily_Return': 'Sector_Return',
    'Close': 'Sector_Price_Avg',
    'Vol_Ratio': 'Sector_Vol_Ratio',
    'Volatility_20d': 'Sector_Volatility',
    'MDD': 'Sector_MDD',
    'Company': 'Stock_Count'
})

print(f"Sector Summary Shape: {df_sector.shape}")
df_sector.head()

Sector Summary Shape: (10064, 8)


Unnamed: 0,Date,Sector,Sector_Return,Sector_Price_Avg,Sector_Vol_Ratio,Sector_Volatility,Sector_MDD,Stock_Count
0,2018-11-29,Communication,,96.704496,,,0.0,2
1,2018-11-29,Consumer Cyclical,,53.211582,,,0.0,2
2,2018-11-29,Consumer Defensive,,85.580177,,,0.0,2
3,2018-11-29,Energy,,61.217907,,,0.0,1
4,2018-11-29,Financial,,140.095683,,,0.0,3


---

## 9. Monthly Summary Dataset 생성

월간 집계 데이터를 생성하여 장기 추세 분석에 활용함

In [63]:
df_monthly = df[['Company', 'Sector', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume']].copy()
df_monthly = df_monthly.set_index('Date').groupby('Company').resample('ME').agg({
    'Sector': 'first',
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum'
}).reset_index()

# 월간 수익률
df_monthly['Monthly_Return'] = df_monthly.groupby('Company')['Close'].pct_change() * 100

# 월간 변동폭
df_monthly['Monthly_Range'] = (df_monthly['High'] - df_monthly['Low']) / df_monthly['Open'] * 100

print(f"Monthly Summary Shape: {df_monthly.shape}")
df_monthly.head()

Monthly Summary Shape: (29232, 10)


Unnamed: 0,Company,Date,Sector,Open,High,Low,Close,Volume,Monthly_Return,Monthly_Range
0,A,2018-11-30,Others,68.673458,70.04247,68.673458,69.753235,4905300,,1.99351
1,A,2018-12-31,Others,70.698073,72.414195,59.986839,65.1996,50474000,-6.528206,17.57807
2,A,2019-01-31,Others,64.271775,73.75306,59.922551,73.501778,44194500,12.733479,21.518791
3,A,2019-02-28,Others,73.608101,77.164765,72.255001,76.778175,35941700,4.457576,6.67014
4,A,2019-03-31,Others,77.319419,79.513353,74.806536,77.686691,32806600,1.1833,6.087497


---

## 10. 데이터 저장

In [64]:
# CSV 형식으로 저장
df_daily.to_csv(DATA_DIR / 'stock_daily_master.csv', index=False)
df_sector.to_csv(DATA_DIR / 'stock_sector_summary.csv', index=False)
df_monthly.to_csv(DATA_DIR / 'stock_monthly_summary.csv', index=False)

print("[저장 완료]")
print(f"  1. Daily Master: {df_daily.shape} -> stock_daily_master.csv")
print(f"  2. Sector Summary: {df_sector.shape} -> stock_sector_summary.csv")
print(f"  3. Monthly Summary: {df_monthly.shape} -> stock_monthly_summary.csv")

[저장 완료]
  1. Daily Master: (602962, 26) -> stock_daily_master.csv
  2. Sector Summary: (10064, 8) -> stock_sector_summary.csv
  3. Monthly Summary: (29232, 10) -> stock_monthly_summary.csv
