# Distillation
The knowledge of a large, cumbersome model can be transferred to a smaller, more efficient model (faster decision-making at lower computational cost), making it possible to deploy powerful AI in resource-constrained environments such as high-frequency trading (HFT).

Importing libraries

In [None]:
import numpy as np
import yfinance as yf
import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

## Data Processing

In [None]:
# Fetch stock data from Yahoo Finance
def fetch_data(stock_symbol, start_date, end_date):
    """Download closing prices for a ticker from Yahoo Finance.

    Args:
        stock_symbol: Ticker symbol forwarded to ``yf.download``.
        start_date: Start of the range, forwarded as ``start``.
        end_date: End of the range, forwarded as ``end``.

    Returns:
        NumPy array of the 'Close' column. When the download yields a
        single price column the result is flattened to 1-D so that it can
        be sliced into fixed-length windows.
    """
    data = yf.download(stock_symbol, start=start_date, end=end_date)
    closes = data['Close'].values  # Using closing prices

    # Depending on the yfinance version, a single-ticker download can come
    # back with MultiIndex columns, in which case data['Close'] is a
    # one-column DataFrame and .values has shape (n, 1). Collapse that to
    # (n,) so downstream windows are 2-D rather than 3-D.
    if closes.ndim == 2 and closes.shape[1] == 1:
        closes = closes[:, 0]
    return closes

# Build sliding-window features and next-step direction labels
def create_features_labels(data, window_size):
    """Turn a price sequence into (window, direction) training pairs.

    Args:
        data: Indexable, sliceable sequence of prices.
        window_size: Number of consecutive values in each feature window.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays. ``x[i]`` is
        ``data[i:i + window_size]``; ``y[i]`` is 1 when the value right
        after that window is strictly greater than the window's last
        value, else 0.
    """
    n_samples = len(data) - window_size

    windows = [data[start:start + window_size] for start in range(n_samples)]
    # Compare the value following each window with the window's final value.
    went_up = [
        data[start + window_size] > data[start + window_size - 1]
        for start in range(n_samples)
    ]

    features = np.array(windows)
    labels = np.array(went_up).astype(int)
    return features, labels

# Experiment configuration
stock_symbol = 'MSFT'
start_date, end_date = '2023-01-01', '2024-12-31'
window_size = 10  # Number of consecutive closing prices per sample

# Download the closing prices, then slice them into (window, label) pairs
stock_data = fetch_data(stock_symbol, start_date=start_date, end_date=end_date)
x_train, y_train = create_features_labels(stock_data, window_size=window_size)

## Create the Teacher Model
The model, named teacher_model, is a sequential neural network that includes dense layers with ReLU activation functions, dropout layers for regularization to prevent overfitting, and a sigmoid activation function in the output layer to predict binary outcomes (price increase or decrease).

In [None]:
# Assemble the teacher network one layer at a time
teacher_model = Sequential()
teacher_model.add(Dense(128, activation='relu', input_shape=(window_size,)))
teacher_model.add(Dropout(0.3))
teacher_model.add(Dense(64, activation='relu'))
teacher_model.add(Dropout(0.3))
# Single sigmoid unit: one probability per sample for the binary label
teacher_model.add(Dense(1, activation='sigmoid'))

# Binary target, so binary cross-entropy is the training loss
teacher_model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Fit the teacher on the windowed data
teacher_model.fit(x_train, y_train, batch_size=32, epochs=20)

## Define the Student Model
The student model will be a simpler version of the teacher model, with fewer parameters.

In [None]:
# Define the student model: one small hidden layer feeding a single
# sigmoid output unit.
student_model = Sequential([
    # Take the input width from window_size (as the teacher does) instead of
    # a hard-coded 10, so changing the window length cannot desynchronise
    # the two models.
    Dense(32, activation='relu', input_shape=(window_size,)),
    Dense(1, activation='sigmoid')
])

student_model.compile(optimizer=Adam(learning_rate=0.001),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

## Distillation
Model distillation enables algorithmic traders to achieve high-performance trading strategies while optimizing for efficiency and flexibility in deployment, even on traditional computing resources like CPUs.

Softening predictions with temperature scaling: the logits (the raw outputs of the teacher model's final layer, before the activation is applied) are divided by a temperature parameter 𝑇 before softmax is applied.

When 𝑇>1, the softmax probabilities become more evenly distributed. For example, if the teacher predicts [0.7, 0.3] rather than a hard [1, 0], this provides richer information for the student model, because it captures relationships between classes that hard labels (e.g., 0 or 1) cannot convey.

In [None]:
# Soften the outputs of the teacher model
temperature = 5.0  # Temperature hyperparameter (T > 1 flattens the targets)
teacher_predictions = teacher_model.predict(x_train)

# The teacher ends in a single sigmoid unit, so predict() returns
# probabilities, not logits. Applying softmax to that one-column output would
# yield 1.0 for every sample, so recover the logit first. Clipping keeps the
# log finite when a prediction saturates at exactly 0 or 1.
eps = 1e-7
teacher_probs = np.clip(
    np.asarray(teacher_predictions, dtype=np.float64), eps, 1.0 - eps
)
teacher_logits = np.log(teacher_probs) - np.log1p(-teacher_probs)

# Temperature-scaled sigmoid: for two classes this equals the first component
# of softmax([z / T, 0]), i.e. the binary form of softmax(logits / T).
softened_teacher_predictions = (
    1.0 / (1.0 + np.exp(-teacher_logits / temperature))
).astype(np.float32)

# Train the student model using the teacher's softened outputs.
# The student has one sigmoid output, so binary cross-entropy against the
# soft targets is the matching loss. No accuracy metric is attached here:
# Keras' binary accuracy tests y_true for equality with the thresholded
# prediction, which is meaningless when y_true is a soft probability.
student_model.compile(optimizer='adam',
                      loss='binary_crossentropy')

student_model.fit(x_train, softened_teacher_predictions, epochs=10, batch_size=32)

# Report how well the distilled student reproduces the original hard labels.
student_labels = (student_model.predict(x_train) > 0.5).astype(int).ravel()
student_accuracy = float(np.mean(student_labels == np.ravel(y_train)))
print(f"Student accuracy against hard labels: {student_accuracy:.4f}")