# Imports

In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Read Data

In [124]:
df = pd.read_csv('./AAPL(80-24) Final.csv')

# Helper Functions

In [125]:
# Convert 'Vol.' and 'Change %' to numeric (same as preprocessing.py)
def convert_volume_to_numeric(volume_str):
    if isinstance(volume_str, str):
        if 'M' in volume_str:
            return float(volume_str.replace('M', '')) * 1e6
        elif 'B' in volume_str:
            return float(volume_str.replace('B', '')) * 1e9
    return float(volume_str)

def convert_change_to_numeric(change_str):
    try:
        if isinstance(change_str, str) and '%' in change_str:
            return float(change_str.replace('%', '')) / 100
        return float(change_str)
    except ValueError:
        return np.nan

# Data Processing

In [126]:
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

df['Year'] = df['Date'].dt.year
# df['Month'] = df['Date'].dt.month # 12 discrete states
df['Day'] = df['Date'].dt.day
df['Day_of_week'] = df['Date'].dt.dayofweek + 1 # 7 discrete states(1 to 7)
df['Quarter'] = df['Date'].dt.quarter # 4 discrete states (1-4)

In [127]:
df['Vol.'] = df['Vol.'].apply(convert_volume_to_numeric)
df['Change %'] = df['Change %'].apply(convert_change_to_numeric)

In [128]:
numeric_cols = ['Price', 'Open', 'High', 'Low', 'Vol.', 'Change %']
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

In [129]:
print(df.isnull().sum())

Date           0
Price          0
Open           0
High           0
Low            0
Vol.           0
Change %       0
Year           0
Day            0
Day_of_week    0
Quarter        0
dtype: int64


# Target Creation

In [130]:
QUARTER_DAYS = 63
df['Price_future_quarter'] = df['Price'].shift(-QUARTER_DAYS)
df['Quarterly_change_pct'] = ((df['Price_future_quarter'] - df['Price']) / df['Price']) * 100

# Binary direction target
df['Direction_quarter'] = (df['Quarterly_change_pct'] > 0).astype(int)

# Multi-class trend classification
valid_changes = df['Quarterly_change_pct'].dropna()
valid_changes_neg = valid_changes[valid_changes < 0]
valid_changes_pos = valid_changes[valid_changes > 0]

thresholds = {
    'large_dec': np.percentile(valid_changes_neg, 10),
    'moderate_dec': np.percentile(valid_changes_neg, 30),
    'moderate_inc': np.percentile(valid_changes_pos, 70),
    'large_inc': np.percentile(valid_changes_pos, 90)
}

conditions = [
    # Negative classes
    (df['Quarterly_change_pct'] < thresholds['large_dec']),
    (df['Quarterly_change_pct'] >= thresholds['large_dec']) & (df['Quarterly_change_pct'] < thresholds['moderate_dec']),
    (df['Quarterly_change_pct'] >= thresholds['moderate_dec']) & (df['Quarterly_change_pct'] < 0),

    # Positive classes
    (df['Quarterly_change_pct'] >= 0) & (df['Quarterly_change_pct'] < thresholds['moderate_inc']),
    (df['Quarterly_change_pct'] >= thresholds['moderate_inc']) & (df['Quarterly_change_pct'] < thresholds['large_inc']),
    (df['Quarterly_change_pct'] >= thresholds['large_inc'])
]

labels = [
    'large_decrease',
    'moderate_decrease',
    'small_decrease',
    'small_increase',
    'moderate_increase',
    'large_increase'
]

df['Trend_class'] = np.select(conditions, labels, default=np.nan)

In [131]:
df = df.dropna().reset_index(drop=True)

# Encoding

In [132]:
encoder = LabelEncoder()
df['Trend_class'] = encoder.fit_transform(df['Trend_class'])

print("Label Encoding Mapping:")
for i, class_name in enumerate(encoder.classes_):
    print(f"{class_name}: {i}")

Label Encoding Mapping:
large_decrease: 0
large_increase: 1
moderate_decrease: 2
moderate_increase: 3
small_decrease: 4
small_increase: 5


# Feature Binning

In [133]:
df['Year_bin'] = pd.qcut(df['Year'], q=5, labels=False, duplicates='drop')
df['Day_bin'] = pd.qcut(df['Day'], q=3, labels=False, duplicates='drop')
df['Day_of_week_bin'] = df['Day_of_week']
df['Quarter_bin'] = df['Quarter']

numeric_features = ['Price', 'Open', 'High', 'Low', 'Vol.', 'Change %']

for col in numeric_features:
    df[f'{col}_bin'] = pd.qcut(df[col], q=5, labels=False, duplicates='drop')

# HMM Observation Creation

In [134]:
# Combine all bins into a single observation code
features = ['Price', 'Open', 'High', 'Low', 'Vol.', 'Change %']
features_w_temporal = ['Price', 'Open', 'High', 'Low', 'Vol.', 'Change %', 'Day_of_week', 'Quarter']

df['observation'] = df[[f'{col}_bin' for col in features]].astype(str).agg('-'.join, axis=1)
df['observation_w_temporal'] = df[[f'{col}_bin' for col in features_w_temporal]].astype(str).agg('-'.join, axis=1)
encoder1= LabelEncoder()
encoder2 = LabelEncoder()
df['observation_code'] = encoder1.fit_transform(df['observation'])
df['observation_code_w_temporal'] = encoder2.fit_transform(df['observation_w_temporal'])

In [135]:
print(df['observation_code'].nunique())
print(df['observation_code_w_temporal'].nunique())

224
1757


# Train Test Split

In [136]:
# Split into train/test (temporal split)
split_idx = int(0.8 * len(df))
train_df = df.iloc[:split_idx]
test_df = df.iloc[split_idx:]

In [137]:
train_df.to_csv("final_train.csv", index=False)
test_df.to_csv("final_test.csv", index=False)

# Gaussian HMM Features

In [138]:
features_gaussian = ['Price', 'Open', 'High', 'Low', 'Vol.', 'Change %', 'Day_of_week', 'Quarter']
scaler = StandardScaler()
df_standardized = pd.DataFrame(scaler.fit_transform(df[features_gaussian]), columns=features_gaussian)

df_standardized['Trend_class'] = df['Trend_class'].values
df_standardized['Direction_quarter'] = df['Direction_quarter'].values

In [139]:
df_standardized

Unnamed: 0,Price,Open,High,Low,Vol.,Change %,Day_of_week,Quarter,Trend_class,Direction_quarter
0,-0.475003,-0.475112,-0.475089,-0.474969,0.437682,-22.465808,1.415156,1.332052,2,0
1,-0.475242,-0.475351,-0.475325,-0.475210,-0.432551,-1.756767,-1.441088,1.332052,4,0
2,-0.475481,-0.475590,-0.475561,-0.475452,-0.640796,-1.900533,-0.727027,1.332052,5,0
3,-0.475242,-0.475351,-0.475325,-0.475210,-0.698059,2.012598,-0.012966,1.332052,4,0
4,-0.475242,-0.475351,-0.475325,-0.475210,-0.736621,-0.029329,0.701095,1.332052,4,0
...,...,...,...,...,...,...,...,...,...,...
10844,4.224148,4.209751,4.177078,4.252100,-0.835237,0.091974,-0.727027,1.332052,4,0
10845,4.173768,4.227436,4.194333,4.226504,-0.805848,-0.269687,-0.012966,1.332052,4,0
10846,4.170187,4.208317,4.180151,4.194388,-0.823808,-0.047299,0.701095,1.332052,4,0
10847,4.144400,4.186330,4.140678,4.181590,-0.844380,-0.152878,1.415156,1.332052,4,0


In [140]:
# Split into train/test (temporal split)
split_idx = int(0.8 * len(df_standardized))
train_df = df_standardized.iloc[:split_idx]
test_df = df_standardized.iloc[split_idx:]

In [143]:
train_df.to_csv("final_train.csv", index=False)
test_df.to_csv("final_test.csv", index=False)