In [52]:
import numpy as np
import pandas as pd
import datetime

from sklearn import model_selection

import tensorflow as tf
import tensorflow.keras as keras


In [53]:
data_dir = '/Users/anishkahc/Desktop/Docs/ML Fall 23/Data/'

openap_file = 'OpenAP_Macro.parquet.gzip'
openap = pd.read_parquet( data_dir + openap_file)

#Code from HW03
# filter data from year 2000 onwards
openap = openap[pd.DatetimeIndex(openap['DateYM']).year >= 2000]
openap.drop(openap.iloc[:,-10:-1], inplace = True, axis = 1)
openap.set_index(keys = ['DateYM', 'permno'], inplace = True, verify_integrity = True)
#rank
cols = openap.columns[:-1]
openap[cols] = openap.groupby('DateYM')[cols].rank(pct=True).subtract(0.5).multiply(2)

In [54]:
#Mapping onto [-1,1] interval
openap.iloc[:, 0:-1] = openap.iloc[:, 0:-1] * 2
openap.iloc[:, 0:-1] = openap.iloc[:, 0:-1] - 1

In [55]:
openap = openap.groupby('DateYM').transform(lambda x: x.fillna(x.median()))
openap.fillna(0, inplace=True)
openap.info(verbose = True, show_counts = True)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1570968 entries, (Timestamp('2000-01-31 00:00:00'), 10001) to (Timestamp('2020-12-31 00:00:00'), 93436)
Data columns (total 207 columns):
 #    Column                     Non-Null Count    Dtype  
---   ------                     --------------    -----  
 0    AbnormalAccruals           1570968 non-null  float64
 1    Accruals                   1570968 non-null  float64
 2    AccrualsBM                 1570968 non-null  float64
 3    Activism1                  1570968 non-null  float64
 4    Activism2                  1570968 non-null  float64
 5    AdExp                      1570968 non-null  float64
 6    AgeIPO                     1570968 non-null  float64
 7    AM                         1570968 non-null  float64
 8    AnalystRevision            1570968 non-null  float64
 9    AnalystValue               1570968 non-null  float64
 10   AnnouncementReturn         1570968 non-null  float64
 11   AOP                        1570968 non-

In [56]:
tf.keras.utils.set_random_seed(3462)
tf.config.experimental.enable_op_determinism()

In [57]:
tf.keras.utils.set_random_seed(3462)
inputs = keras.Input(shape=(206,))
d = keras.layers.Dense(units=32, activation='relu')(inputs)
d1 = keras.layers.Dense(units=32, activation='relu')(d)
d2 = keras.layers.Dense(units=32, activation='relu')(d1)
outputs = keras.layers.Dense(units=1)(d2)

# Define the Model
model = keras.Model(inputs=inputs, outputs=outputs, name='3Layer_Perceptron')
model.summary()

Model: "3Layer_Perceptron"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 206)]             0         
                                                                 
 dense_26 (Dense)            (None, 32)                6624      
                                                                 
 dense_27 (Dense)            (None, 32)                1056      
                                                                 
 dense_28 (Dense)            (None, 32)                1056      
                                                                 
 dense_29 (Dense)            (None, 1)                 33        
                                                                 
Total params: 8769 (34.25 KB)
Trainable params: 8769 (34.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [58]:
# Compile the Model
model.compile(loss='mse',
              metrics=keras.metrics.MeanAbsoluteError(),
              optimizer=keras.optimizers.legacy.Adam(learning_rate=0.001)
              )

In [59]:
# early stopping callback
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

In [60]:
year=[]
r2_result = []
predictions_df = pd.DataFrame(columns=['permno', 'DateYM', 'Y_true', 'Y_pred'])

n = 10

train_start = 2000
train_end = 2005
validation_end = 2008
test_end = 2009
    
for i in range(n):
    print("Training year ", i + 2009) 
    year.append(i + 2009)

    train = openap[(openap.index.get_level_values('DateYM').year >= train_start) & (openap.index.get_level_values('DateYM').year <= train_end+i)].copy()
    val = openap[(openap.index.get_level_values('DateYM').year >= train_end) & (openap.index.get_level_values('DateYM').year <= validation_end+i)].copy()
    test = openap[(openap.index.get_level_values('DateYM').year > validation_end+i) & (openap.index.get_level_values('DateYM').year <= test_end+i)].copy()
    
    train.sort_index(inplace=True)
    test.sort_index(inplace=True)
        
    #Create a flag 'test_fold'
    train = train.assign(test_fold = -1)
    train.loc[pd.DatetimeIndex(train.index.get_level_values('DateYM')).year > train_end+i, 'test_fold'] = 0
    val = val.assign(test_fold = -1)
    val.loc[pd.DatetimeIndex(val.index.get_level_values('DateYM')).year > train_end+i, 'test_fold'] = 0
    
#     assert train[train.index.get_level_values('DateYM').year>train_end+i].test_fold.min() == 0
#     assert train[train.index.get_level_values('DateYM').year>train_end+i].test_fold.max() == 0
        
    # Note the callback and the validation set

    history = model.fit(train.drop(columns=[ 'test_fold', 'retadj']),train['retadj'],
                  epochs=100,
                  batch_size= 64,
                  verbose=2,
                  validation_data=(val.drop(columns=[ 'test_fold', 'retadj']), val['retadj']),
                  callbacks=[early_stopping_cb]
                 )
    
    Y_train = model.predict(X=train.drop(columns=[ 'test_fold', 'retadj']))
    Y_test = model.predict(X=test.drop(columns=['retadj']))
    
    # Calculate R^2
    r2_train = R2_oos(Y_train, Y_train)
    print('Training R2:', r2_train)
    
    r2_test = metrics.r2_score(test['retadj'], Y_test)
    print('Test R2:', r2_test)
    
    r2_result.append((i+2009, r2_train, r2_test))
    
    #Referred to Ashutosh's code for saving the prediction values
    predictions = pd.DataFrame({
        'permno': test.index.get_level_values('permno'),
        'DateYM': test.index.get_level_values('DateYM'),
        'y_true': test['retadj'],
        'y_pred': Y_test,
    })
    
    predictions_df = pd.concat([predictions_df,predictions], ignore_index=True)  


Training year  2009
Epoch 1/100
8146/8146 - 5s - loss: 0.0416 - mean_absolute_error: 0.1186 - val_loss: 0.0213 - val_mean_absolute_error: 0.0909 - 5s/epoch - 660us/step
Epoch 2/100
8146/8146 - 5s - loss: 0.0398 - mean_absolute_error: 0.1159 - val_loss: 0.0219 - val_mean_absolute_error: 0.0936 - 5s/epoch - 605us/step
Epoch 3/100
8146/8146 - 5s - loss: 0.0390 - mean_absolute_error: 0.1152 - val_loss: 0.0214 - val_mean_absolute_error: 0.0914 - 5s/epoch - 650us/step
Epoch 4/100
8146/8146 - 6s - loss: 0.0387 - mean_absolute_error: 0.1148 - val_loss: 0.0221 - val_mean_absolute_error: 0.0940 - 6s/epoch - 702us/step
Epoch 5/100
8146/8146 - 5s - loss: 0.0385 - mean_absolute_error: 0.1147 - val_loss: 0.0221 - val_mean_absolute_error: 0.0942 - 5s/epoch - 616us/step
Epoch 6/100


KeyboardInterrupt: 

In [61]:
# Got this code from chatGPT. Looked through the documentation too. 
# https://psutil.readthedocs.io/en/latest/
# https://docs.python.org/3/library/platform.html

import platform
import os
import psutil

# CPU Type and Number of Cores
cpu_type = platform.processor()
cpu_count = os.cpu_count()

# RAM
ram_info = psutil.virtual_memory()
total_ram = ram_info.total / (1024 ** 3)  # Convert to GB

print(f"CPU Type: {cpu_type}")
print(f"Number of Cores: {cpu_count}")
print(f"Total RAM (GB): {total_ram:.2f}")


CPU Type: arm
Number of Cores: 10
Total RAM (GB): 16.00
