# Imports and Installation

In [1]:
import pandas as pd
import numpy as np
import holidays
from datetime import date
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split

In [79]:
from google.colab import files

# files.upload() returns the uploaded files' raw bytes keyed by filename.
# Leaving the call as the last bare expression makes the notebook display that
# dict, which writes the Kaggle API key into the saved output. Binding the
# result to a name suppresses the echo.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"charlieshang","key":"<REDACTED>"}'}

In [80]:
# Install the Kaggle command-line client.
!pip install kaggle
# Create ~/.kaggle (no error if it already exists) and move the uploaded
# kaggle.json into it; presumably this is where the CLI reads its credentials
# from - confirm against the Kaggle API docs.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json



In [81]:
# Download the Apple historical stock-price dataset archive with the Kaggle CLI.
# As the recorded output shows, the CLI skips the download when a more recent
# local copy exists unless --force is passed.
!kaggle datasets download -d shiivvvaam/apple-stock-market-historical-data-1980-2024

Dataset URL: https://www.kaggle.com/datasets/shiivvvaam/apple-stock-market-historical-data-1980-2024
License(s): CC0-1.0
apple-stock-market-historical-data-1980-2024.zip: Skipping, found more recently modified local copy (use --force to force download)


In [94]:
import zipfile

# Unpack the downloaded archive into /content. The context manager guarantees
# the archive handle is closed even if extraction raises part-way through.
with zipfile.ZipFile('/content/apple-stock-market-historical-data-1980-2024.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

# Read Data

In [95]:
# Load the extracted CSV into the working frame and preview its first rows.
df = pd.read_csv(filepath_or_buffer='/content/AAPL(80-24) Final.csv')
df.head(n=5)

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,03/27/2024,173.31,170.3,173.58,170.14,59.11M,2.12%
1,03/26/2024,169.71,170.01,171.41,169.65,57.22M,-0.67%
2,03/25/2024,170.85,170.37,171.94,169.46,54.21M,-0.83%
3,03/22/2024,172.28,171.76,173.05,170.06,71.16M,0.53%
4,03/21/2024,171.37,177.05,177.49,170.84,106.18M,-4.09%


# Data Info

In [96]:
# Count missing values per column of the freshly loaded frame.
print(df.isna().sum())

Date        0
Price       0
Open        0
High        0
Low         0
Vol.        1
Change %    0
dtype: int64


In [97]:
# Show each column's dtype; per the recorded output, 'Date', 'Vol.' and
# 'Change %' are still object (string) columns at this point.
print(df.dtypes)

Date         object
Price       float64
Open        float64
High        float64
Low         float64
Vol.         object
Change %     object
dtype: object


# Date Processing

In [98]:
# Parse the date strings, then put the rows in ascending date order with a
# fresh 0..n-1 index so that later row shifts move forward in time.
df = (
    df.assign(Date=pd.to_datetime(df['Date']))
      .sort_values('Date')
      .reset_index(drop=True)
)

# Calendar components derived from the parsed date.
date_parts = df['Date'].dt
df['Year'] = date_parts.year
# df['Month'] = df['Date'].dt.month # 12 discrete states
df['Day'] = date_parts.day
# pandas numbers Monday as 0, so the +1 maps Monday..Sunday onto 1..7.
df['Day_of_week'] = date_parts.dayofweek + 1
# Calendar quarter, 1 through 4.
df['Quarter'] = date_parts.quarter

In [99]:
def convert_volume_to_numeric(volume_str):
    """Convert a volume value such as '59.11M' into a plain float.

    Args:
        volume_str: A string, optionally carrying a magnitude suffix
            ('K' = thousand, 'M' = million, 'B' = billion), an already-numeric
            value, or a missing value (None / NaN).

    Returns:
        The volume as a float, or None when the input is missing.

    Raises:
        ValueError: If a string input cannot be parsed as a number.
    """
    if not isinstance(volume_str, str):
        # Missing stays missing (pandas turns the None into NaN).
        if pd.isna(volume_str):
            return None
        # Values that are already numeric used to be silently dropped to None.
        return float(volume_str)

    # Checked in this order so 'M' and 'B' behave exactly as before; 'K' is new.
    suffix_multipliers = (('M', 1_000_000), ('B', 1_000_000_000), ('K', 1_000))
    for suffix, multiplier in suffix_multipliers:
        if suffix in volume_str:
            return float(volume_str.replace(suffix, '')) * multiplier
    return float(volume_str)

# Run the parser on every 'Vol.' entry and overwrite the column with the results.
df['Vol.'] = df['Vol.'].apply(convert_volume_to_numeric)


In [100]:
def convert_change_to_numeric(change_str):
    """Convert a change value such as '2.12%' into a fraction (0.0212).

    Args:
        change_str: A string, optionally ending in '%', or a numeric value.
            Strings containing '%' are divided by 100; anything else is passed
            to float() unchanged.

    Returns:
        The parsed float, or None when the value cannot be interpreted as a
        number (including None itself).
    """
    try:
        if isinstance(change_str, str) and '%' in change_str:
            return float(change_str.replace('%', '')) / 100
        return float(change_str)
    except (ValueError, TypeError):
        # TypeError covers non-numeric objects such as None, which float()
        # rejects with a different exception than an unparseable string.
        return None

# Run the parser on every 'Change %' entry and overwrite the column with the results.
df['Change %'] = df['Change %'].apply(convert_change_to_numeric)

In [101]:
# Collect the names of all float64 columns; these are the ones imputed below.
numeric_columns = df.select_dtypes(include=['float64']).columns
print(numeric_columns)

Index(['Price', 'Open', 'High', 'Low', 'Vol.', 'Change %'], dtype='object')


In [102]:
# Fill each numeric column's missing entries with that column's own mean.
df[numeric_columns] = df[numeric_columns].fillna(value=df[numeric_columns].mean())

In [103]:
# Re-check missing-value counts after imputation.
print(df.isna().sum())

Date           0
Price          0
Open           0
High           0
Low            0
Vol.           0
Change %       0
Year           0
Day            0
Day_of_week    0
Quarter        0
dtype: int64


# Feature Engineering

In [48]:
# us_holidays = holidays.US(years=df['Date'].dt.year.unique())
# holiday_dates = [pd.Timestamp(d) for d in us_holidays.keys()]
# df['Is_holiday'] = df['Date'].isin(holiday_dates).astype(int)
# df['Is_month_end'] = df['Date'].dt.is_month_end.astype(int)
# df['Is_quarter_end'] = df['Date'].dt.is_quarter_end.astype(int)

In [105]:
# Quantile-bin the calendar fields into integer codes: Year into up to 5 bins,
# Day into up to 3. duplicates='drop' merges bins whose edges coincide.
for source_column, bin_count in (('Year', 5), ('Day', 3)):
    df[f'{source_column}_bin'] = pd.qcut(
        df[source_column], q=bin_count, labels=False, duplicates='drop'
    )

# Target Creation

In [106]:
# Forecast horizon, in rows of the date-sorted frame.
# NOTE(review): 63 presumably approximates the trading days in one quarter - confirm.
QUARTER_DAYS = 63

# Price QUARTER_DAYS rows ahead. The final QUARTER_DAYS rows have no future
# price, so both derived columns are NaN there.
df['Price_future_quarter'] = df['Price'].shift(-QUARTER_DAYS)
df['Quarterly_change_pct'] = ((df['Price_future_quarter'] - df['Price']) / df['Price']) * 100

# Binary direction target: 1 when the price is higher one horizon ahead.
# NaN > 0 is False, so rows without a future price get 0 here; they are still
# identifiable through the NaN in 'Price_future_quarter'.
df['Direction_quarter'] = (df['Quarterly_change_pct'] > 0).astype(int)

# Multi-class trend classification.
# Class boundaries are percentiles taken separately over the negative and the
# positive changes, using every row that has a future price.
valid_changes = df['Quarterly_change_pct'].dropna()
valid_changes_neg = valid_changes[valid_changes < 0]
valid_changes_pos = valid_changes[valid_changes > 0]

thresholds = {
    'large_dec': np.percentile(valid_changes_neg, 10),
    'moderate_dec': np.percentile(valid_changes_neg, 30),
    'moderate_inc': np.percentile(valid_changes_pos, 70),
    'large_inc': np.percentile(valid_changes_pos, 90)
}

conditions = [
    # Negative classes
    (df['Quarterly_change_pct'] < thresholds['large_dec']),
    (df['Quarterly_change_pct'] >= thresholds['large_dec']) & (df['Quarterly_change_pct'] < thresholds['moderate_dec']),
    (df['Quarterly_change_pct'] >= thresholds['moderate_dec']) & (df['Quarterly_change_pct'] < 0),

    # Positive classes (a change of exactly 0 falls into 'small_increase')
    (df['Quarterly_change_pct'] >= 0) & (df['Quarterly_change_pct'] < thresholds['moderate_inc']),
    (df['Quarterly_change_pct'] >= thresholds['moderate_inc']) & (df['Quarterly_change_pct'] < thresholds['large_inc']),
    (df['Quarterly_change_pct'] >= thresholds['large_inc'])
]

labels = [
    'large_decrease',
    'moderate_decrease',
    'small_decrease',
    'small_increase',
    'moderate_increase',
    'large_increase'
]

# np.select needs the default to share a dtype with the string labels. A
# np.nan default becomes the literal text 'nan' on older NumPy and raises
# TypeError on newer releases. Use an empty-string placeholder instead, then
# blank out the rows that have no future price so they hold a real missing value.
has_future_price = df['Quarterly_change_pct'].notna()
trend_labels = np.select(conditions, labels, default='')
df['Trend_class'] = pd.Series(trend_labels, index=df.index, dtype=object).where(has_future_price)

In [107]:
# Columns whose raw values are replaced by quantile-bin codes.
numeric_features = ['Price', 'Open', 'High', 'Low', 'Vol.', 'Change %']

# Overwrite each feature in place with its integer bin index (at most 5 bins;
# duplicates='drop' merges bins whose edges coincide). The raw values are lost,
# so running this cell a second time would bin the bin codes.
for feature_name in numeric_features:
    quintile_codes = pd.qcut(df[feature_name], q=5, labels=False, duplicates='drop')
    df[feature_name] = quintile_codes

In [108]:
# Discard every row that still has a missing value in any column, then renumber.
df = df.dropna(axis='index', how='any').reset_index(drop=True)

In [109]:
# Confirm that no missing values remain in any column.
print(df.isna().sum())

Date                    0
Price                   0
Open                    0
High                    0
Low                     0
Vol.                    0
Change %                0
Year                    0
Day                     0
Day_of_week             0
Quarter                 0
Year_bin                0
Day_bin                 0
Price_future_quarter    0
Quarterly_change_pct    0
Direction_quarter       0
Trend_class             0
dtype: int64


In [110]:
# List the current columns before the helper and intermediate ones are dropped.
df.columns

Index(['Date', 'Price', 'Open', 'High', 'Low', 'Vol.', 'Change %', 'Year',
       'Day', 'Day_of_week', 'Quarter', 'Year_bin', 'Day_bin',
       'Price_future_quarter', 'Quarterly_change_pct', 'Direction_quarter',
       'Trend_class'],
      dtype='object')

In [111]:
# Remove the raw date fields (their binned versions stay) and the intermediate
# columns that were only needed to build the targets.
df = df.drop(
    labels=['Date', 'Year', 'Day', 'Price_future_quarter', 'Quarterly_change_pct'],
    axis='columns',
)

# Encoding

In [112]:
# Learn the set of trend labels, then replace each label with its integer code.
encoder = LabelEncoder()
encoder.fit(df['Trend_class'])
df['Trend_class'] = encoder.transform(df['Trend_class'])

# Print the label -> code table; a code is the label's position in classes_.
print("Label Encoding Mapping:")
for i, class_name in zip(range(len(encoder.classes_)), encoder.classes_):
    print(f"{class_name}: {i}")

Label Encoding Mapping:
large_decrease: 0
large_increase: 1
moderate_decrease: 2
moderate_increase: 3
small_decrease: 4
small_increase: 5


In [113]:
# Preview the fully encoded frame (all remaining columns are integer codes).
df.head()

Unnamed: 0,Price,Open,High,Low,Vol.,Change %,Day_of_week,Quarter,Year_bin,Day_bin,Direction_quarter,Trend_class
0,0,0,0,0,4,0,5,4,0,1,0,2
1,0,0,0,0,2,0,1,4,0,1,0,4
2,0,0,0,0,1,0,2,4,0,1,0,5
3,0,0,0,0,0,3,3,4,0,1,0,4
4,0,0,0,0,0,1,4,4,0,1,0,4


# Train Test Split

In [114]:
# The two label columns; every other column is treated as a feature.
target_variables = ['Direction_quarter', 'Trend_class']

# Build five chronological folds and keep each (train, test) pair.
# NOTE(review): the targets look QUARTER_DAYS rows ahead, so the last training
# rows of a fold are labelled with prices from its test window. Consider
# TimeSeriesSplit(gap=...) if that overlap matters.
tscv = TimeSeriesSplit(n_splits=5)
splits = []
for train_idx, test_idx in tscv.split(df):
    train_set, test_set = df.iloc[train_idx], df.iloc[test_idx]
    splits.append((train_set, test_set))

# Only the last fold is used downstream.
final_train, final_test = splits[-1]

# Separate features from targets and convert to plain NumPy arrays.
feature_columns = [name for name in df.columns if name not in target_variables]
X_train = final_train[feature_columns].values
X_test = final_test[feature_columns].values
y_train = final_train[target_variables].values
y_test = final_test[target_variables].values

In [115]:
# Preview the first rows of the held-out test fold.
final_test.head()

Unnamed: 0,Price,Open,High,Low,Vol.,Change %,Day_of_week,Quarter,Year_bin,Day_bin,Direction_quarter,Trend_class
9041,4,4,4,4,0,1,2,4,4,1,1,5
9042,4,4,4,4,0,1,3,4,4,1,1,5
9043,4,4,4,4,0,1,4,4,4,1,1,5
9044,4,4,4,4,0,1,5,4,4,1,1,5
9045,4,4,4,4,0,2,1,4,4,2,1,5


In [116]:
# Report the dimensions of each exported array, features first and then targets.
for caption, array in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, array.shape)

X_train shape: (9041, 10)
X_test shape: (1808, 10)
y_train shape: (9041, 2)
y_test shape: (1808, 2)


# Save data

In [117]:
# Write the full frame and the final train/test folds as CSV, without the index.
for frame, csv_name in (
    (df, "df.csv"),
    (final_train, "final_train.csv"),
    (final_test, "final_test.csv"),
):
    frame.to_csv(csv_name, index=False)

In [118]:
# Persist the feature and target arrays in NumPy's binary .npy format.
for npy_name, array in (
    ("X_train.npy", X_train),
    ("y_train.npy", y_train),
    ("X_test.npy", X_test),
    ("y_test.npy", y_test),
):
    np.save(npy_name, array)