# FIT5149 - Applied Data Analysis: Assignment 1 (Monthly Stock Prediction: Prediction)

This notebook covers the final prediction part of the assignment using the trained ensemble model.

In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError

Load and prepare the data

In [2]:
def load_data(data_dir="data"):
    """Read the primary CSV tables and every optional CSV table under ``data_dir``.

    Parameters
    ----------
    data_dir : str or Path, default "data"
        Directory holding ``stock_data.csv``, ``company_info.csv``,
        ``monashIndex.csv`` and ``training_targets.csv``. Optional tables are
        read from its ``optional_data`` sub-directory.

    Returns
    -------
    tuple
        ``(stock, company, index, train_targets, optional_data_list)`` where the
        first four items are DataFrames and the last is a list of DataFrames,
        one per ``*.csv`` file found in ``optional_data`` (ordered by path).
    """
    # Primary data
    primary_data_dir = Path(data_dir)
    stock = pd.read_csv(primary_data_dir / "stock_data.csv")
    company = pd.read_csv(primary_data_dir / "company_info.csv")
    index = pd.read_csv(primary_data_dir / "monashIndex.csv")
    train_targets = pd.read_csv(primary_data_dir / "training_targets.csv")

    # Optional data
    optional_data_dir = primary_data_dir / "optional_data"
    # Path.glob yields entries in a filesystem-dependent order; sorting keeps the
    # list (and therefore the column order after merging) reproducible.
    optional_data_list = [
        pd.read_csv(fname) for fname in sorted(optional_data_dir.glob("*.csv"))
    ]

    return stock, company, index, train_targets, optional_data_list

In [3]:
# Read every input table from the default "data" directory.
(stock, company, index, train_targets, optional_data_list) = load_data(data_dir="data")

In [4]:
def merge_data(stock, company, index, train_targets, optional_data_list=None,
               predict_month_id="2023_07"):
    """Join all tables into one frame and append placeholder rows for the month to predict.

    Parameters
    ----------
    stock : DataFrame
        Monthly per-stock data; must contain ``month_id`` and ``stock_id``.
    company : DataFrame
        Per-stock static data keyed by ``stock_id``. One placeholder row is
        created for every row of this table.
    index : DataFrame
        Per-month data keyed by ``month_id``.
    train_targets : DataFrame
        Targets keyed by ``month_id`` and ``stock_id``.
    optional_data_list : list of DataFrame, optional
        Extra per-month tables, each left-joined on ``month_id``.
    predict_month_id : str, default "2023_07"
        ``month_id`` assigned to the appended placeholder rows.

    Returns
    -------
    DataFrame
        The left-joined table. Placeholder rows carry only ``month_id``,
        ``stock_id`` and whatever the joins can supply; everything else is NaN.
    """
    # Add the rows to predict: one (predict_month_id, stock_id) pair per company row.
    next_month_df = pd.DataFrame({
        "month_id": [predict_month_id] * len(company),
        "stock_id": company["stock_id"].values,
    })
    # ignore_index avoids duplicate index labels between the two concatenated frames.
    new_stock = pd.concat([stock, next_month_df], ignore_index=True)

    # Merge/join the primary data
    df = pd.merge(new_stock, company, on=["stock_id"], how="left")
    df = pd.merge(df, index, on=["month_id"], how="left")
    df = pd.merge(df, train_targets, on=["month_id", "stock_id"], how="left")

    # Optionally merge the optional data (None or an empty list means "skip").
    for optional_data in optional_data_list or []:
        df = pd.merge(df, optional_data, on=["month_id"], how="left")

    return df

We do not use any of the optional features here (see our modelling experiment notebook).

In [5]:
# Join the primary tables only; the optional tables are deliberately left out.
df = merge_data(
    stock=stock,
    company=company,
    index=index,
    train_targets=train_targets,
    optional_data_list=None,
)

Impute missing values and build rolling/window features.

In [6]:
def impute_missing_columns(df, imputation_method):
    """Fill missing values in every non-ID column of ``df``, in place.

    Parameters
    ----------
    df : DataFrame
        Frame to impute. It is modified in place and also returned.
    imputation_method : str
        ``"ffill"`` propagates the last valid value down each column in the
        current row order; ``"median"`` fills numeric columns with their column
        median. Any other value leaves the frame untouched.

    Returns
    -------
    DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Forward fill follows row order over the whole frame and is not grouped by
    ``stock_id``, so a value can carry over from one stock's rows to the next.
    """
    for column in df.columns:
        if column in ("month_id", "stock_id"):  # never impute ID columns
            continue

        if imputation_method == "ffill":
            # Series.ffill() replaces the deprecated fillna(method="ffill").
            df[column] = df[column].ffill()
        elif imputation_method == "median":
            # A median is only defined for numeric data; leave text columns as-is
            # instead of failing on them.
            if pd.api.types.is_numeric_dtype(df[column]):
                df[column] = df[column].fillna(df[column].median())

    return df

def _grouped_rolling(grouped, col, window, stat):
    """Return rolling ``stat`` of ``col`` over ``window`` rows, computed within each group."""
    return grouped[col].transform(
        lambda x: getattr(x.rolling(window, min_periods=1), stat)()
    )


def compute_rolling_statistic_features(df, macro_cols):
    """Add per-stock rolling, lagged and price-range features to ``df`` in place.

    Rolling windows and lags follow the existing row order inside each
    ``stock_id`` group, so the frame should already be sorted chronologically
    within each stock before calling this function.

    Parameters
    ----------
    df : DataFrame
        Must contain ``stock_id``, ``month_id`` and the source columns used
        below (returns, volatilities, volumes, month high/low, price range
        ratio, index return). New columns are appended to this same object.
    macro_cols : list of str or None
        Month-level columns for which rolling mean/std over 3, 6 and 12 months
        are added. ``None`` or an empty list skips this step.

    Returns
    -------
    DataFrame
        The same ``df`` object with the feature columns added.
    """
    grouped = df.groupby('stock_id')

    # Price-based returns
    for window in [3, 6, 12]:
        for stat in ('mean', 'std', 'median'):
            df[f'intramonth_return_rolling_{stat}_{window}m'] = _grouped_rolling(
                grouped, 'intramonth_return', window, stat)

    # Lagged returns
    for col in ['return_1m', 'return_3m', 'return_6m']:
        for lag in [1, 2]:
            df[f'{col}_lagged_{lag}'] = grouped[col].shift(lag)

    # Volatility features
    for col, windows in [
        ('intramonth_volatility', [3, 6, 12]),
        ('volatility_3m', [3, 6]),
        ('volatility_6m', [3, 6])
    ]:
        for window in windows:
            for stat in ('mean', 'std'):
                df[f'{col}_rolling_{stat}_{window}m'] = _grouped_rolling(grouped, col, window, stat)

    # Volume features
    for col, windows in [
        ('monthly_volume', [3, 6]),
        ('avg_volume_3m', [3, 6]),
        ('volume_ratio', [3, 6])
    ]:
        for window in windows:
            for stat in ('mean', 'sum'):
                df[f'{col}_rolling_{stat}_{window}m'] = _grouped_rolling(grouped, col, window, stat)

    # Price range
    df['price_range'] = df['month_high_usd'] - df['month_low_usd']
    # Re-create the grouping so it is built from a frame that already has 'price_range'.
    grouped = df.groupby('stock_id')
    for window in [3, 6, 12]:
        df[f'price_range_rolling_mean_{window}m'] = _grouped_rolling(
            grouped, 'price_range', window, 'mean')
        df[f'price_range_ratio_rolling_mean_{window}m'] = _grouped_rolling(
            grouped, 'price_range_ratio', window, 'mean')

    # Index rolling features
    for window in [3, 6]:
        for stat in ('mean', 'std'):
            df[f'index_return_rolling_{stat}_{window}m'] = _grouped_rolling(
                grouped, 'index_return', window, stat)

    # Additional (month-level) features
    if macro_cols:
        # Rolling inside groupby('month_id') would run across the stocks of a single
        # month rather than across months. Instead, reduce to one row per month,
        # roll over the months in sorted month_id order, then map the result back
        # onto every row of that month.
        # NOTE(review): takes the first row of each month as that month's macro
        # value, i.e. assumes macro columns are constant within a month - confirm.
        monthly = df.drop_duplicates(subset='month_id').set_index('month_id').sort_index()
        for col in macro_cols:
            for window in [3, 6, 12]:
                rolling = monthly[col].rolling(window, min_periods=1)
                df[f'{col}_rolling_mean_{window}m'] = df['month_id'].map(rolling.mean())
                df[f'{col}_rolling_std_{window}m'] = df['month_id'].map(rolling.std())

    return df
    
def handle_missing_rolling_features(df, strategy="drop", fill_value=0):
    """Resolve NaNs in the engineered columns (names containing "lag" or "rolling").

    Parameters
    ----------
    df : DataFrame
        Frame holding the engineered columns.
    strategy : str, default "drop"
        ``"drop"`` removes rows with a NaN in any engineered column (returns a
        new frame); ``"fill"`` replaces NaNs with ``fill_value``; ``"ffill"`` /
        ``"bfill"`` propagate the previous / next valid value in row order.
        The three fill strategies write into ``df`` itself. Any other value
        leaves the frame unchanged.
    fill_value : scalar, default 0
        Replacement used by the ``"fill"`` strategy.

    Returns
    -------
    DataFrame
        The processed frame.

    Notes
    -----
    ``"ffill"`` and ``"bfill"`` follow row order over the whole frame and are
    not grouped by stock.
    """
    rolling_cols = [c for c in df.columns if "lag" in c or "rolling" in c]
    if strategy == "drop":
        df = df.dropna(subset=rolling_cols)
    elif strategy == "fill":
        df[rolling_cols] = df[rolling_cols].fillna(fill_value)
    elif strategy == "ffill":
        # DataFrame.ffill()/bfill() replace the deprecated fillna(method=...).
        df[rolling_cols] = df[rolling_cols].ffill()
    elif strategy == "bfill":
        df[rolling_cols] = df[rolling_cols].bfill()

    return df

def build_training_data(df, imputation_method, macro_cols=None,
                        build_additional_features=False, handle_missing_rolling_features_strategy="bfill"):
    """Impute, sort and optionally feature-engineer a copy of ``df``.

    Parameters
    ----------
    df : DataFrame
        Merged input frame; it is copied, not modified.
    imputation_method : str
        Passed to ``impute_missing_columns``.
    macro_cols : list of str, optional
        Passed to ``compute_rolling_statistic_features``.
    build_additional_features : bool, default False
        When true, add the rolling/lag features and then resolve their NaNs.
    handle_missing_rolling_features_strategy : str, default "bfill"
        Passed to ``handle_missing_rolling_features`` as ``strategy``.

    Returns
    -------
    DataFrame
        The processed copy, sorted by ``stock_id`` then ``month_id``, with both
        target columns taken from the original ``df``.
    """
    features = impute_missing_columns(df.copy(), imputation_method=imputation_method)
    features = features.sort_values(by=["stock_id", "month_id"])

    if build_additional_features:
        features = handle_missing_rolling_features(
            compute_rolling_statistic_features(features, macro_cols),
            strategy=handle_missing_rolling_features_strategy,
        )

    # Imputation also touched the target columns; overwrite them with the
    # original values (assignment aligns on the index, which sorting preserved).
    for target in ("outperform_binary", "excess_return"):
        features[target] = df[target]

    return features

In [7]:
# Forward-fill raw gaps, add the rolling/lag features, then back-fill their NaNs.
train_df = build_training_data(
    df,
    imputation_method="ffill",
    macro_cols=None,
    build_additional_features=True,
    handle_missing_rolling_features_strategy="bfill",
)

  df[column] = df[column].fillna(method="ffill")
  df[rolling_cols] = df[rolling_cols].fillna(method="bfill")


Split between training and test data. We will be using the test data only this time.

In [8]:
def split_data(df, task, last_train_month_id="2023_06", test_month_id="2023_07"):
    """Split ``df`` by month into training features/target and test features.

    Parameters
    ----------
    df : DataFrame
        Must contain ``month_id``, ``outperform_binary`` and ``excess_return``.
    task : {"classification", "regression"}
        Selects the target: ``outperform_binary`` cast to int, or
        ``excess_return`` cast to float.
    last_train_month_id : str, default "2023_06"
        Rows whose ``month_id`` compares ``<=`` this value form the training set.
    test_month_id : str, default "2023_07"
        Rows whose ``month_id`` equals this value form the test set.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test)``; both target columns are removed from
        the feature frames.

    Raises
    ------
    ValueError
        If ``task`` is not one of the supported values.
    """
    target_by_task = {
        "classification": ("outperform_binary", int),
        "regression": ("excess_return", float),
    }
    # Fail early with a clear message instead of an UnboundLocalError at return.
    if task not in target_by_task:
        raise ValueError(
            f"Unknown task {task!r}; expected one of {sorted(target_by_task)}"
        )
    target_col, target_dtype = target_by_task[task]

    train_df = df[df['month_id'] <= last_train_month_id]
    test_df = df[df['month_id'] == test_month_id]

    target_cols = ["outperform_binary", "excess_return"]
    X_train = train_df.drop(columns=target_cols)
    X_test = test_df.drop(columns=target_cols)
    y_train = train_df[target_col].astype(target_dtype)

    return X_train, y_train, X_test

In [9]:
# Only X_test_clf is used for prediction below.
(X_train_clf, y_train_clf, X_test_clf) = split_data(df=train_df, task="classification")

Define the `TopKFeatureSelector` class as the top feature selector pipeline step.

In [10]:
class TopKFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep the ``k`` columns with the highest precomputed importance.

    Parameters
    ----------
    feature_names : sequence of str
        Name of each input column, in column order.
    importances : array-like
        Importance score of each input column, in the same order.
    k : int, default 20
        Number of top-ranked columns to keep.

    Attributes
    ----------
    selected_idx : ndarray or None
        Column positions chosen by ``fit``; ``None`` until fitted.
    """

    def __init__(self, feature_names, importances, k=20):
        self.feature_names = feature_names
        self.importances = importances
        self.k = k
        # NOTE(review): the attribute name is kept as-is; presumably the pickled
        # pipelines loaded later in this notebook contain instances that store
        # their state under this name - confirm before renaming.
        self.selected_idx = None

    def fit(self, X, y=None):
        """Rank columns by descending importance and remember the first ``k`` positions."""
        ranking = np.argsort(self.importances)[::-1]
        self.selected_idx = ranking[:self.k]
        return self

    def _check_fitted(self):
        """Raise ``NotFittedError`` when no column selection has been computed yet."""
        if getattr(self, "selected_idx", None) is None:
            raise NotFittedError(
                "This TopKFeatureSelector instance is not fitted yet. "
                "Call 'fit' before using this estimator."
            )

    def transform(self, X):
        """Return the selected columns of the 2-D array ``X``."""
        # Without this guard an unfitted selector would evaluate X[:, None],
        # which silently adds an axis instead of failing.
        self._check_fitted()
        return X[:, self.selected_idx]

    def get_feature_names_out(self, input_features=None):
        """Return the names of the selected columns, in ranking order."""
        self._check_fitted()
        return [self.feature_names[i] for i in self.selected_idx]

Predict test data for `outperform_binary` and `excess_return` and save the prediction results.

In [11]:
# NOTE(security): pickle.load can execute arbitrary code from the file; only load
# model files produced by our own training notebooks.
with open("files/models/classification/final_voting_classifier.pkl", 'rb') as file:
    clasification_ensemble_pipeline = pickle.load(file)
with open("files/models/regression/final_voting_classifier.pkl", 'rb') as file:
    regression_ensemble_pipeline = pickle.load(file)

# Take an explicit copy so the prediction columns are added to an independent
# frame rather than to a slice of X_test_clf (avoids SettingWithCopyWarning).
submission_df = X_test_clf[["stock_id", "month_id"]].copy()
submission_df["outperform_binary"] = clasification_ensemble_pipeline.predict(X_test_clf)
submission_df["excess_return"] = regression_ensemble_pipeline.predict(X_test_clf)
submission_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df["outperform_binary"] = clasification_ensemble_pipeline.predict(X_test_clf)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df["excess_return"] = regression_ensemble_pipeline.predict(X_test_clf)


Unnamed: 0,stock_id,month_id,outperform_binary,excess_return
25618,US001,2023_07,0,0.000332
25619,US002,2023_07,0,-0.002541
25620,US003,2023_07,0,0.011159
25621,US004,2023_07,1,0.015339
25622,US005,2023_07,1,0.012703


In [12]:
# Create the output directory first so the notebook also runs on a fresh checkout.
submission_dir = Path("files/submission")
submission_dir.mkdir(parents=True, exist_ok=True)

# The Kaggle file carries only the stock identifier and the regression prediction.
kaggle_submission_df = submission_df[["stock_id", "excess_return"]]
submission_df.to_csv(submission_dir / "testing_targets.csv", index=False)
kaggle_submission_df.to_csv(submission_dir / "kaggle_submission.csv", index=False)