<a href="https://colab.research.google.com/github/anunknownpleasure/Pricing-assets-with-deep-learning/blob/main/Asset_pricing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Installing libraries


In [71]:
!pip install getFamaFrenchFactors



In [72]:
!pip install lxml



In [73]:
!pip install pandas_datareader



# 2. Data Import and preprocessing

## 2a. Importing Fama-French 5 factor data


In [74]:
import numpy as np
import pandas as pd
import getFamaFrenchFactors

print("Successfully imported getFamaFrenchFactors!")
print(dir(getFamaFrenchFactors))

Successfully imported getFamaFrenchFactors!


In [75]:
from getFamaFrenchFactors import famaFrench5Factor


# Get the factors
factors_df = famaFrench5Factor()

# Adjust from percentage
factors_df[['Mkt-RF', 'SMB', 'HML', 'RF']] = factors_df[['Mkt-RF', 'SMB', 'HML', 'RF']] / 100



In [76]:
factors_df.head(20)

Unnamed: 0,date_ff_factors,Mkt-RF,SMB,HML,RMW,CMA,RF
0,1963-07-31,-3.9e-05,-4.8e-05,-8.1e-05,0.0064,-0.0115,2.7e-05
1,1963-08-31,0.000508,-8e-05,0.00017,0.004,-0.0038,2.5e-05
2,1963-09-30,-0.000157,-4.3e-05,0.0,-0.0078,0.0015,2.7e-05
3,1963-10-31,0.000254,-0.000134,-4e-06,0.0279,-0.0225,2.9e-05
4,1963-11-30,-8.6e-05,-8.5e-05,0.000173,-0.0043,0.0227,2.7e-05
5,1963-12-31,0.000183,-0.000189,-2.1e-05,0.0012,-0.0025,2.9e-05
6,1964-01-31,0.000227,1e-05,0.000163,0.0021,0.0148,3e-05
7,1964-02-29,0.000155,3.3e-05,0.000281,0.0011,0.0081,2.6e-05
8,1964-03-31,0.000141,0.000141,0.000329,-0.0203,0.0298,3.1e-05
9,1964-04-30,1.1e-05,-0.000148,-5.4e-05,-0.0132,-0.0113,2.9e-05


We adjust the start date to -1-31-1964

In [77]:
FFdata = factors_df.iloc[6:]
FFdata['date_ff_factors'] = pd.to_datetime(FFdata['date_ff_factors'])
FFdata = FFdata.set_index('date_ff_factors')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FFdata['date_ff_factors'] = pd.to_datetime(FFdata['date_ff_factors'])


## 2b. Importing the macroeconomic data

In [78]:
import pandas_datareader.data as web

# --- 1. Define the 5 Long-History Macro Indicators (FRED Tickers) ---
long_macro_tickers = {
    'Term_Spread': 'T10YFFM',      # 10-Yr Yield minus Fed Funds Rate
    'Default_Spread': 'AAAFFM',     # Baa Corp Yield minus Aaa Corp Yield (Risk Aversion Proxy)
    'Ind_Production': 'INDPRO',     # Industrial Production Index
    'Unemployment': 'UNRATE',       # Civilian Unemployment Rate
    'Consumer_Sentiment': 'UMCSENT' # University of Michigan Consumer Sentiment
}

# --- 2. Define the Time Period
start_date = '1964-01-01'
end_date = pd.to_datetime('today').strftime('%Y-%m-%d')

# --- 3. Fetch Data from FRED ---
try:
    macro_data = web.DataReader(
        list(long_macro_tickers.values()),
        'fred',
        start=start_date,
        end=end_date
    )
    macro_data.columns = list(long_macro_tickers.keys())

except Exception as e:
    print(f"Error fetching data from FRED: {e}")
    macro_data = pd.DataFrame(index=pd.date_range(start_date, end_date, freq='M'))

# --- 4. Align Data to Monthly Frequency ---

# 4a. Forward-Fill any monthly gaps (common in macro data)
macro_data = macro_data.ffill()

# 4b. Ensure all data points are at the end of the month for clean alignment
macro_data = macro_data.resample('ME').last()


# --- 5. Display the result ---
print(f"Macro Data Imported and Aligned ({len(macro_data)} periods, starting {macro_data.index.min().strftime('%Y-%m')}):")
print(macro_data.head())
print("\n... and the tail:")
print(macro_data.tail())

# The resulting 'macro_data' DataFrame is ready for merging with Fama-French data.

Macro Data Imported and Aligned (741 periods, starting 1964-01):
            Term_Spread  Default_Spread  Ind_Production  Unemployment  \
DATE                                                                    
1964-01-31         0.69            0.91         27.7409           5.6   
1964-02-29         0.67            0.88         27.9291           5.4   
1964-03-31         0.79            0.95         27.9291           5.4   
1964-04-30         0.76            0.93         28.3861           5.3   
1964-05-31         0.70            0.91         28.5474           5.1   

            Consumer_Sentiment  
DATE                            
1964-01-31                 NaN  
1964-02-29                99.5  
1964-03-31                99.5  
1964-04-30                99.5  
1964-05-31                98.5  

... and the tail:
            Term_Spread  Default_Spread  Ind_Production  Unemployment  \
DATE                                                                    
2025-05-31         0.09    

In [79]:
macro_data.columns

Index(['Term_Spread', 'Default_Spread', 'Ind_Production', 'Unemployment',
       'Consumer_Sentiment'],
      dtype='object')

##2c. Importing the FF-portfolios

In [80]:
# --- 1. Define Time Period ---
# Must match the start date used for your FF factors and macro data (e.g., 1964-01-01)
start_date = '1964-01-01'
end_date = pd.to_datetime('today').strftime('%Y-%m-%d')

# --- 2. Fetch the 25 Portfolios (Size x Book-to-Market) ---
# The data is downloaded as a dictionary object
ff_portfolio = web.DataReader(
    '25_Portfolios_5x5',
    'famafrench',
    start=start_date,
    end=end_date
)

df_returns_25 = ff_portfolio[0]
print(df_returns_25.shape)
df_returns_25 = df_returns_25.replace([-99.99, -999], np.nan) # Missing values are indicated by -99.99 or -999
df_returns_25.dropna(inplace=True)
print(df_returns_25.shape)
df_returns_25 = (df_returns_25/100)# Converting from percentage to fraction

# Convert PeriodIndex to DatetimeIndex at the end of the month
df_returns_25.index = df_returns_25.index.to_timestamp(how = 'end').date




  ff_portfolio = web.DataReader(
  ff_portfolio = web.DataReader(
  ff_portfolio = web.DataReader(
  ff_portfolio = web.DataReader(
  ff_portfolio = web.DataReader(
  ff_portfolio = web.DataReader(
  ff_portfolio = web.DataReader(


(740, 25)
(740, 25)


  ff_portfolio = web.DataReader(
  ff_portfolio = web.DataReader(
  ff_portfolio = web.DataReader(


## 2d. Combining all the data into a DataFrame

In [81]:

# Making the index into a datetime object
macro_data.index = pd.to_datetime(macro_data.index)


# Combine the dataframes using merge on the index

combined_data_FF_macro = pd.merge(FFdata, macro_data, left_index=True, right_index=True, how='inner') # Combining FF and Macro. Dropping first row because of a null value

combined_data = pd.merge(combined_data_FF_macro, df_returns_25, left_index=True, right_index=True, how='inner')

combined_data.dropna(inplace=True) # Dropping NaN entries

# Display the combined data
print("Combined Data:")
display(combined_data.head())
print("\n... and the tail:")
display(combined_data.tail())

Combined Data:


Unnamed: 0,Mkt-RF,SMB,HML,RMW,CMA,RF,Term_Spread,Default_Spread,Ind_Production,Unemployment,...,ME4 BM1,ME4 BM2,ME4 BM3,ME4 BM4,ME4 BM5,BIG LoBM,ME5 BM2,ME5 BM3,ME5 BM4,BIG HiBM
1964-02-29,0.000155,3.3e-05,0.000281,0.0011,0.0081,2.6e-05,0.67,0.88,27.9291,5.4,...,0.025943,0.015619,0.028444,0.072047,0.046121,0.018271,0.005232,0.010194,0.039989,0.037567
1964-03-31,0.000141,0.000141,0.000329,-0.0203,0.0298,3.1e-05,0.79,0.95,27.9291,5.4,...,0.01775,0.029767,0.052497,0.071287,0.007247,0.011575,0.007635,0.036237,0.038382,0.001491
1964-04-30,1.1e-05,-0.000148,-5.4e-05,-0.0132,-0.0113,2.9e-05,0.76,0.93,28.3861,5.3,...,-0.027045,0.003434,0.019784,-0.026384,-0.022805,0.002272,0.014745,0.008082,-0.009054,0.024147
1964-05-31,0.000141,-6.2e-05,0.000181,-0.0015,0.0013,2.6e-05,0.7,0.91,28.5474,5.1,...,0.011914,0.022992,0.013559,0.013281,0.04099,0.020599,0.003304,0.011776,0.042859,0.033968
1964-06-30,0.000127,1.3e-05,6.8e-05,-0.0033,0.001,3e-05,0.67,0.91,28.628,5.2,...,0.010927,0.014771,0.011035,0.024965,0.031119,0.009744,0.028644,0.004437,0.013682,0.024217



... and the tail:


Unnamed: 0,Mkt-RF,SMB,HML,RMW,CMA,RF,Term_Spread,Default_Spread,Ind_Production,Unemployment,...,ME4 BM1,ME4 BM2,ME4 BM3,ME4 BM4,ME4 BM5,BIG LoBM,ME5 BM2,ME5 BM3,ME5 BM4,BIG HiBM
2025-04-30,-8.4e-05,-0.000186,-0.00034,-0.0285,-0.0267,3.5e-05,-0.05,1.12,103.6224,4.2,...,-0.008766,-0.012699,-0.020146,-0.039276,-0.072668,0.014106,-0.030129,-0.073867,-0.013472,-0.027941
2025-05-31,0.000606,-7.2e-05,-0.000288,0.0126,0.0251,3.8e-05,0.09,1.21,103.657,4.2,...,0.062577,0.050222,0.035353,0.081175,0.065826,0.078077,0.061296,0.018407,0.026156,0.065684
2025-06-30,0.000486,-2e-06,-0.00016,-0.0319,0.0145,3.4e-05,0.05,1.13,104.2115,4.1,...,0.020351,0.043192,0.024175,0.073815,0.058024,0.055279,0.062451,0.047405,0.036424,0.070109
2025-07-31,0.000198,-1.5e-05,-0.000127,-0.0029,-0.0207,3.4e-05,0.06,1.12,103.8194,4.2,...,0.034206,0.021766,0.009013,-0.001069,-0.019303,0.032949,0.014068,0.012224,0.002333,-0.013744
2025-08-31,0.000185,0.000488,0.000441,-0.0069,0.0207,3.8e-05,-0.07,1.02,103.9203,4.3,...,0.036524,0.016164,0.023481,0.058848,0.071342,0.01161,0.011927,0.030567,0.054257,0.090799


In [82]:
combined_data.columns

Index(['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF', 'Term_Spread',
       'Default_Spread', 'Ind_Production', 'Unemployment',
       'Consumer_Sentiment', 'SMALL LoBM', 'ME1 BM2', 'ME1 BM3', 'ME1 BM4',
       'SMALL HiBM', 'ME2 BM1', 'ME2 BM2', 'ME2 BM3', 'ME2 BM4', 'ME2 BM5',
       'ME3 BM1', 'ME3 BM2', 'ME3 BM3', 'ME3 BM4', 'ME3 BM5', 'ME4 BM1',
       'ME4 BM2', 'ME4 BM3', 'ME4 BM4', 'ME4 BM5', 'BIG LoBM', 'ME5 BM2',
       'ME5 BM3', 'ME5 BM4', 'BIG HiBM'],
      dtype='object')

In [83]:
FF_columns = combined_data.columns[:6]
macro_columns = combined_data.columns[6:11]
portfolio_columns = combined_data.columns[11:]

no_of_FF_features = len(FF_columns)
no_macro_features = len(macro_columns)
no_of_portfolios = len(portfolio_columns)




## 2e. Data Processing

In [84]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()


processed_data = scaler.fit_transform(combined_data)
processed_data[:5]

processed_data.shape


(739, 36)

We create rolling windows for LSTM input

In [85]:
# Function to create rolling windows on a timeseries data

def rolling_window(data, lookback):
  x_rolling = []
  for i in range(len(data) - lookback):
    x_rolling.append(data[i: i + lookback])

  return np.array(x_rolling)

lookback = 12
ff_data = processed_data[:, :no_of_FF_features]
macro_data = processed_data[:, no_of_FF_features: no_of_FF_features + no_macro_features]
portfolio_data = processed_data[:, no_of_FF_features + no_macro_features:]

# Creating rolling window on macro data
X_macro_rolled = rolling_window(macro_data, lookback)

# Aligning the rolling data with FF factor and Portfolio data
X_ff_aligned = ff_data[lookback:]
Y_targets_aligned = portfolio_data[lookback:]




In [101]:
# Creating Dataset

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset # Import Subset

class AssetPricingDataset(Dataset):

    def __init__(self, macro_data, ff_data, target_data):
        # 1. Convert all data to PyTorch tensors
        #    We use .float() for all data as they are continuous variables.
        self.X_macro = torch.tensor(macro_data).float()
        self.X_ff = torch.tensor(ff_data).float()
        self.Y_targets = torch.tensor(target_data).float()

    def __len__(self):
        return len(self.X_macro)

    def __getitem__(self, idx):
        return {
            'macro_X': self.X_macro[idx],    # Shape: [12, 5]
            'ff_X': self.X_ff[idx],          # Shape: [6]
            'target_Y': self.Y_targets[idx]  # Shape: [25]
        }

data = AssetPricingDataset(X_macro_rolled, X_ff_aligned, Y_targets_aligned)
no_of_samples = len(data)
train_size = int(len(data)*0.8)

# Use Subset to create train and test datasets
indices = list(range(no_of_samples))
train_indices = indices[:train_size]
test_indices = indices[train_size:]

train_data = Subset(data, train_indices)
test_data = Subset(data, test_indices)


BATCH_SIZE = 64

train_dataset = DataLoader(dataset = train_data, batch_size=BATCH_SIZE, shuffle = True, drop_last = True)
test_dataset = DataLoader(dataset = test_data, batch_size=BATCH_SIZE, shuffle = False, drop_last = False)

# **Model**

## Generator

In [87]:
class Generator(nn.Module):
  def __init__(self, macro_dim, hidden_dim, lstm_layers, sdf_dim):
    super().__init__()
    # Define the LSTM layer for processing macro data
    self.lstm = nn.LSTM(input_size= macro_dim, hidden_size=hidden_dim, num_layers=lstm_layers, batch_first=True)

    # Define the Linear layer to output the SDF
    # The input size to this layer should combine the output of the LSTM and the FF factors
    self.linear = nn.Linear(in_features=hidden_dim, out_features=sdf_dim)

  def forward(self, macro_X, ff_X):
    _, (h_n, c_n) = self.lstm(macro_X)
    h_t = h_n[-1]

    f_t = self.linear(h_t)



    return f_t, h_t



## Discriminator

In [88]:
import torch.nn.functional as F

class Discriminator(nn.Module):

    def __init__(self, hidden_dim, num_assets, hidden_layer):
        # hidden_dim is the size of h_t (e.g., 8)
        # num_assets is the size of the output portfolio weights g_t (25)
        super(Discriminator, self).__init__()

        # Layer 1: Maps the state vector (8) to a richer feature space (16)
        self.fc1 = nn.Linear(in_features=hidden_dim, out_features = hidden_layer)

        # Layer 2: Maps the features (16) to the final portfolio weights (25)
        self.fc2 = nn.Linear(in_features = hidden_layer, out_features=num_assets)

    def forward(self, h_t):

        x = F.relu(self.fc1(h_t))
        g_t = self.fc2(x)

        return g_t


In [89]:
# Loss function

def calculate_SDF(f_t, ff_X):
  ones = torch.ones(ff_X.shape[0], 1)
  ff_aug = torch.cat([ones, ff_X], dim=1)
  sdf_m = (f_t * ff_aug).sum(dim=1)

  return sdf_m

def discriminator_loss(f_t, g_t, ff_X, target_Y):
  SDF = calculate_SDF(f_t, ff_X)
  portfolio_returns = (g_t * target_Y).sum(dim=1)
  pricing_error_of_sample = SDF*portfolio_returns

  loss = - torch.abs(pricing_error_of_sample.mean())

  return loss

def generator_loss(f_t, g_t, ff_X, target_Y):
  SDF = calculate_SDF(f_t, ff_X)
  portfolio_returns = (g_t * target_Y).sum(dim=1)
  pricing_error_of_sample = SDF*portfolio_returns

  loss = torch.abs(pricing_error_of_sample.mean())

  return loss

In [90]:
len(train_dataset)


0

In [91]:
# Training loop:
import torch.optim as optim

macro_dim = 5
no_of_assets = 25
sdf_dim = 7

#Hyperparameters
hidden_dim = 8 # Generator hidden dimension
lstm_layers = 2
hidden_layer = 16 # Discriminator hidden dimension

generator = Generator(macro_dim, hidden_dim, lstm_layers, sdf_dim)
discriminator = Discriminator(hidden_dim, no_of_assets, hidden_layer)

D_learning_rate = 1e-4
G_learning_rate = 1e-4
D_optim = optim.Adam(discriminator.parameters(), lr = D_learning_rate)
G_optim = optim.Adam(generator.parameters(), lr = G_learning_rate)
epochs = 10

for epoch in range(epochs):
  D_loss = None # Initialize D_loss for the epoch
  G_loss = None # Initialize G_loss for the epoch
  for batch_idx, batch in enumerate(train_dataset):

    # Training the discriminator on data

    macro_X = batch['macro_X']
    ff_X = batch['ff_X']
    target_Y = batch['target_Y']

    D_optim.zero_grad()
    f_t, h_t = generator.forward(macro_X, ff_X)
    g_t = discriminator.forward(h_t.detach())

    D_loss = discriminator_loss(f_t.detach(), g_t, ff_X, target_Y)
    D_loss.backward()
    D_optim.step()

    # Training the generator

    G_optim.zero_grad()
    f_t, h_t = generator(macro_X, ff_X)
    g_t = discriminator(h_t) # D's weights are now fixed/frozen, but its output guides G

    # Assume loss_generator is defined to MINIMIZE the error (|Error|)
    G_loss= generator_loss(f_t, g_t, ff_X, target_Y)

    # F. Backward Pass & Update G
    G_loss.backward()
    G_optim.step()

  # Check if D_loss and G_loss were assigned in the inner loop before printing
  if D_loss is not None and G_loss is not None:
    print(f'Epoch [{epoch+1}/{epochs}], Loss D: {D_loss.item():.6f}, Loss G: {G_loss.item():.6f}')
  else:
    print(f'Epoch [{epoch+1}/{epochs}]: No batches processed in this epoch.')

Epoch [1/10]: No batches processed in this epoch.
Epoch [2/10]: No batches processed in this epoch.
Epoch [3/10]: No batches processed in this epoch.
Epoch [4/10]: No batches processed in this epoch.
Epoch [5/10]: No batches processed in this epoch.
Epoch [6/10]: No batches processed in this epoch.
Epoch [7/10]: No batches processed in this epoch.
Epoch [8/10]: No batches processed in this epoch.
Epoch [9/10]: No batches processed in this epoch.
Epoch [10/10]: No batches processed in this epoch.
