In [5]:
import pandas as pd
from pathlib import Path
import numpy as np

In [6]:
# Load the technical-indicator dataset from the processed CSV file.
INPUT_PATH = Path("../../data/processed/ta/ta_data_technical_indicators.csv")
df = pd.read_csv(filepath_or_buffer=INPUT_PATH)
print(f"\nüìÇ Data loaded from: {INPUT_PATH}")


üìÇ Data loaded from: ..\..\data\processed\ta\ta_data_technical_indicators.csv


In [7]:
# ---------------------------------------------------------
# Feature engineering: add derived technical-indicator flags
# ---------------------------------------------------------

# 1. Volume spike: flag rows whose volume ratio exceeds 1.5.
#    Per the original note, 'volume_ratio' is assumed to be
#    volume / volume_ma_20 and to have been computed upstream
#    (ta_scraping or ta_calculation) -- not verified here.
if 'volume_ratio' not in df.columns:
    print("‚ö†Ô∏è Warning: 'volume_ratio' column not found. Skipping Volume_Spike.")
else:
    df['Volume_Spike'] = df['volume_ratio'].gt(1.5).astype(int)

# 2. RSI signals: below 30 sets the oversold flag, above 70 the overbought flag.
if 'rsi_14' not in df.columns:
    print("‚ö†Ô∏è Warning: 'rsi_14' column not found. Skipping RSI signals.")
else:
    df['RSI_Oversold'] = df['rsi_14'].lt(30).astype(int)
    df['RSI_Overbought'] = df['rsi_14'].gt(70).astype(int)

# 3. Price vs. moving average (trend).
#    Per the original note, a divergence > 0 means the price sits above the MA.
#    A missing divergence column is skipped silently (no warning is printed).
if 'ma_20_divergence' in df.columns:
    df['Price_Above_MA20'] = df['ma_20_divergence'].gt(0).astype(int)

if 'ma_50_divergence' in df.columns:
    df['Price_Above_MA50'] = df['ma_50_divergence'].gt(0).astype(int)

# This message is printed unconditionally, even when a feature above was skipped.
print("‚úÖ Added derived technical features (Volume_Spike, RSI_Signals, Trend).")

‚úÖ Added derived technical features (Volume_Spike, RSI_Signals, Trend).


In [8]:
# ---------------------------------------------------------
# Compute Future Return (future profit after 7 days)
# ---------------------------------------------------------

# Parse the time column into datetimes when the column is present.
if 'time' in df.columns:
    df['time'] = pd.to_datetime(df['time'])

# Order rows by symbol, then by time, and renumber the index from 0 so the
# per-symbol look-ahead that follows sees each symbol's rows in sequence.
df = df.sort_values(by=['symbol', 'time'], ignore_index=True)

# Compute the future return for each stock symbol
def calculate_future_return(group, horizon=7):
    """Append forward-looking close and return columns to one symbol's rows.

    The input frame is not modified; a new frame is returned. Functions passed
    to ``groupby(...).apply`` should not mutate the group they receive, so the
    new columns are attached with ``assign`` instead of in-place assignment.

    Args:
        group: DataFrame with a numeric ``close`` column. The rows are assumed
            to belong to a single symbol and to be in chronological order;
            this function neither sorts nor checks that.
        horizon: Number of rows to look ahead (default 7). This counts rows,
            not calendar days.

    Returns:
        A copy of ``group`` with two additional columns:
        ``Future_Close`` (the close ``horizon`` rows later) and
        ``Future_Return_{horizon}d`` (relative change from ``close`` to
        ``Future_Close``). The last ``horizon`` rows have no future row and
        hold NaN in both columns.
    """
    # A negative shift pulls later rows upward, aligning each row with the
    # close observed `horizon` rows after it.
    future_close = group['close'].shift(-horizon)
    return group.assign(**{
        'Future_Close': future_close,
        f'Future_Return_{horizon}d': (future_close - group['close']) / group['close'],
    })

# For every row, look up the close 7 rows later within the same symbol.
# A grouped shift is used instead of
# `groupby('symbol').apply(calculate_future_return)`: applying a function to
# group frames that still contain the grouping column is deprecated since
# pandas 2.2, and the built-in shift stays inside each symbol just the same.
# Rows are expected to be ordered by symbol and time at this point.
df['Future_Close'] = df.groupby('symbol')['close'].shift(-7)

# Relative change between the current close and the future close.
df['Future_Return_7d'] = (df['Future_Close'] - df['close']) / df['close']

print("\n‚úÖ Future_Return_7d calculated successfully.")
print(f"   Non-NaN values: {df['Future_Return_7d'].notna().sum()}")
print(f"   NaN values (last 7 days per stock): {df['Future_Return_7d'].isna().sum()}")
print(f"\nüìä Future Return statistics:")
print(df['Future_Return_7d'].describe())


‚úÖ Future_Return_7d calculated successfully.
   Non-NaN values: 65622
   NaN values (last 7 days per stock): 700

üìä Future Return statistics:
count    65622.000000
mean         0.006034
std          0.070070
min         -0.495327
25%         -0.023762
50%          0.000000
75%          0.028388
max          1.119388
Name: Future_Return_7d, dtype: float64


In [9]:
# Create the binary target variable 'Target':
# 1 = BUY (return > 2% after 7 days)
# 0 = DO NOT BUY (return <= 2% after 7 days)
def classify_target(row, threshold=0.02):
    """Label one row as BUY (1) or DO NOT BUY (0) from its future return.

    Args:
        row: Mapping-like object (for example a DataFrame row) that exposes a
            ``'Future_Return_7d'`` entry.
        threshold: Return that must be strictly exceeded for a BUY label.
            Defaults to 0.02 (2%).

    Returns:
        ``np.nan`` when the future return is missing, ``1`` when it is
        strictly greater than ``threshold``, and ``0`` otherwise.
    """
    future_return = row['Future_Return_7d']

    # Rows without a future return cannot be labelled; keep them as NaN
    # rather than silently counting them as "do not buy".
    if pd.isna(future_return):
        return np.nan

    return 1 if future_return > threshold else 0

# Label every row with classify_target (one call per row).
df['Target'] = df.apply(func=classify_target, axis='columns')

# Report how many rows could not be labelled, then the class balance
# as absolute counts and as percentages.
print("\n‚úÖ Binary Target variable 'Target' created successfully.")
print(f"   NaN Targets (last 7 days): {df['Target'].isna().sum()}")
print(f"\nüéØ Target distribution:")
print(df['Target'].value_counts().sort_index())
print(f"\nüìã Target distribution (%):")
print(df['Target'].value_counts(normalize=True).sort_index().mul(100))

# Last expression of the cell: preview of the first rows.
df.head()


‚úÖ Binary Target variable 'Target' created successfully.
   NaN Targets (last 7 days): 700

üéØ Target distribution:
Target
0.0    45850
1.0    19772
Name: count, dtype: int64

üìã Target distribution (%):
Target
0.0    69.869861
1.0    30.130139
Name: proportion, dtype: float64


Unnamed: 0,time,open,high,low,close,volume,symbol,price_range,price_range_pct,body_size_pct,...,minus_di,adx,Volume_Spike,RSI_Oversold,RSI_Overbought,Price_Above_MA20,Price_Above_MA50,Future_Close,Future_Return_7d,Target
0,2023-01-03,11.28,11.28,11.0,11.19,3601,AAM,0.28,2.545455,0.797872,...,0.0,,0,0,0,0,0,10.81,-0.033959,0.0
1,2023-01-04,10.67,11.0,10.67,10.91,3900,AAM,0.33,3.092784,2.249297,...,41.25,100.0,0,1,0,0,0,11.1,0.017415,0.0
2,2023-01-05,10.44,11.19,10.44,11.19,2332,AAM,0.75,7.183908,7.183908,...,36.129032,100.0,0,0,0,1,1,10.53,-0.058981,0.0
3,2023-01-06,11.0,11.14,10.96,10.96,600,AAM,0.18,1.642336,0.363636,...,31.460674,100.0,0,0,0,0,0,11.0,0.00365,0.0
4,2023-01-09,11.28,11.28,10.91,10.96,441,AAM,0.37,3.391384,2.836879,...,26.046512,90.0,0,0,0,0,0,10.72,-0.021898,0.0


In [None]:
# Write the processed data to a new CSV file (the row index is not written).
OUTPUT_PATH = Path("../../data/processed/ta/ta_data_with_target.csv")
df.to_csv(path_or_buf=OUTPUT_PATH, index=False)
print(f"\nüíæ Data with target saved to: {OUTPUT_PATH}")


üíæ Data with target saved to: ..\..\data\processed\ta\ta_data_with_target.csv
