In [1]:
# Mount Google Drive so the data folder is reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Change the working directory to the data folder; relative paths used later
# in this notebook (e.g. 'handoffs/...') are resolved against it
%cd /content/drive/MyDrive/Colab Notebooks/Bachelor Thesis/Notebooks/Data

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Bachelor Thesis/Notebooks/Data


# Data import

Missingness was addressed in the previous notebook. Here data is preprocessed further (outlier detection and scaling).

To do so, data is first imported.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Import the cleaned stock-return data from the hand-off folder
# (path is relative to the current working directory)
data_stock_returns = pd.read_parquet('handoffs/data_stock_returns_cleaned.parquet')

# Import the cluster-label table; its 'characteristic' column is used below
# as the base list of primary features
data_label_clusters = pd.read_csv("https://raw.githubusercontent.com/bkelly-lab/ReplicationCrisis/master/GlobalFactors/Cluster%20Labels.csv")

We use a slightly modified set of primary features, so the additional ones are added below.

In [6]:
# Primary features: the characteristics listed in the cluster-label table
list_primary_features = data_label_clusters['characteristic'].tolist()

# Additional primary features that are not part of the cluster-label table
vars_add = ['me', 'me_company', 'ff49']

# Combine both sources and keep only the features that exist in the data.
# Filtering the DataFrame's own columns (instead of converting a set back to a
# list) yields a deterministic order that follows the column order of
# `data_stock_returns`; the order of list(set(...)) over strings can differ
# between kernel sessions because of string hash randomisation.
list_primary_features = data_stock_returns.columns[
    data_stock_returns.columns.isin(set(list_primary_features).union(vars_add))
].tolist()

We note that `ff49` is an integer categorical variable, so we do not clean it like the others (we later one-hot encode it, and therefore skip outlier detection and scaling for this variable). The features to clean are retrieved below.

In [7]:
# All primary features except the categorical 'ff49', which is encoded separately
features_to_clean = [column for column in list_primary_features if column != 'ff49']

In [8]:
# Remove variables that are no longer needed
del data_label_clusters, vars_add

# Outlier detection

We use winsorization to limit the influence of outliers (*preprocessing is not the goal of the BA, so we keep it brief*).

In [9]:
from scipy.stats.mstats import winsorize

In [10]:
# Define winsorization function for a series
def winsorize_series(series, lower=0.025, upper=0.025):
    """Winsorize a one-dimensional collection of values.

    Parameters
    ----------
    series : array-like
        Values to winsorize, e.g. one feature column of one group.
    lower : float, default 0.025
        Fraction of the smallest values that is clipped.
    upper : float, default 0.025
        Fraction of the largest values that is clipped.

    Returns
    -------
    numpy.ma.MaskedArray
        A winsorized copy of the input; the input itself is not modified.
    """
    # nan_policy='omit' keeps NaN entries as NaN. With SciPy's default
    # ('propagate') NaNs sort as the largest values, so the upper-tail
    # clipping would overwrite them with a finite number.
    return winsorize(series, limits=[lower, upper], nan_policy='omit')

# Cast every column selected as "Float64" to NumPy float64 before winsorizing
data_stock_returns = data_stock_returns.astype(
    dict.fromkeys(
        data_stock_returns.select_dtypes(include=["Float64"]).columns,
        np.float64,
    )
)

# Winsorize each feature in features_to_clean separately within every 'eom' group
data_stock_returns[features_to_clean] = (
    data_stock_returns
    .groupby("eom")[features_to_clean]
    .transform(winsorize_series)
)

# Scaling features

As prescribed in the book *Machine Learning for Factor Investing*, features are scaled on a date-by-date (`eom`) and feature-by-feature basis. We use a quantile transformer with a uniform output distribution, which maps the values into the interval from 0 to 1.

In [11]:
from sklearn.preprocessing import QuantileTransformer
import warnings

In [12]:
# Helper function for scaling
def scale_series(series, random_state=0):
    """Map the values of one series to a uniform distribution on [0, 1].

    Parameters
    ----------
    series : pandas.Series
        Values of one feature (one group) to transform.
    random_state : int, default 0
        Seed passed to the QuantileTransformer so that repeated runs give the
        same result; scikit-learn uses it when it subsamples large inputs.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of transformed values, one per input value.
    """
    # scikit-learn expects a 2-D array of shape (n_samples, n_features)
    column = series.values.reshape(-1, 1)

    # Cap the number of quantiles at the number of samples. scikit-learn
    # applies the same cap internally but warns about it, so setting it
    # explicitly makes suppressing that warning unnecessary.
    n_quantiles = max(1, min(1000, column.shape[0]))

    # Initialize transformer
    qt = QuantileTransformer(
        n_quantiles=n_quantiles,
        output_distribution='uniform',
        random_state=random_state,
    )

    # Fit on this series only and return the result as a flat array
    return qt.fit_transform(column).flatten()

# Scale each feature in features_to_clean separately within every 'eom' group
data_stock_returns[features_to_clean] = (
    data_stock_returns
    .groupby("eom")[features_to_clean]
    .transform(scale_series)
)

# Encoding

`ff49` is an integer categorical variable. One-hot encoding is thus used to represent it. This is done below.

In [13]:
# Import one hot encoder
from sklearn.preprocessing import OneHotEncoder

In [14]:
# Encoder that returns a dense array (one indicator column per category)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Indicator columns for 'ff49', carrying the same index as the source frame
hot_one_data = pd.DataFrame(
    data=encoder.fit_transform(data_stock_returns[['ff49']]),
    index=data_stock_returns.index,
    columns=encoder.get_feature_names_out(['ff49']),
)

# Replace the original 'ff49' column with its indicator columns
data_stock_returns = pd.concat(
    objs=[data_stock_returns.drop(columns=['ff49']), hot_one_data],
    axis="columns",
)

In the end, we have the following amount of data:

In [16]:
# Report the dimensions (rows, columns) of the final data frame
data_shape = data_stock_returns.shape
print(f"Final data frame shape: {data_shape}")

Final data frame shape: (225120, 198)


# Save data

Finally, the data is saved.

In [18]:
# Write the final data frame to the hand-off folder; index=False means the
# DataFrame index is not stored in the parquet file
data_stock_returns.to_parquet("/content/drive/MyDrive/Colab Notebooks/Bachelor Thesis/Notebooks/Data/handoffs/data_stock_returns_final.parquet", index=False)