In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


In [2]:
import sys
print(sys.executable)

C:\Users\Andy\.conda\envs\option_project\python.exe


In [3]:
###Create function for model generation and run on all models

def createNN_Model_V4(model_name, training_input, training_outcomes, positive_weight, epoch_number, batch_number, pred_cutoff, actual_changes=None):
    # Ensure reproducibility
    np.random.seed(42)

    print(f"Running binary classification model on: {model_name}")
    # Check for CUDA and set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    X = torch.tensor(np.array(training_input.T), dtype=torch.float32)
    y = torch.tensor(np.array(training_outcomes), dtype=torch.float32).reshape(-1, 1)

    # Move data to the chosen device
    X = X.to(device)
    y = y.to(device)
    class PimaClassifier(nn.Module):
        def __init__(self):
            super().__init__()
            self.hidden1 = nn.Linear(training_input.shape[0], 64)
            self.act1 = nn.ReLU()
            self.hidden2 = nn.Linear(64, 12)
            self.act2 = nn.ReLU()
            self.hidden3 = nn.Linear(12, 8)
            self.act3 = nn.ReLU()
            self.output = nn.Linear(8, 1)
            self.act_output = nn.Sigmoid()
     
        def forward(self, x):
            x = self.act1(self.hidden1(x))
            x = self.act2(self.hidden2(x))
            x = self.act3(self.hidden3(x))
            x = self.act_output(self.output(x))
            return x

    temp_model = PimaClassifier().to(device)
    print(temp_model)

    pos_weight = torch.tensor([positive_weight], device=device)  # Set your_pos_weight based on your needs
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(temp_model.parameters(), lr=0.001)

    n_epochs = epoch_number
    batch_size = batch_number
    best_loss = float('inf')

    for epoch in range(n_epochs):
        for i in range(0, len(X), batch_size):
            Xbatch = X[i:i+batch_size]
            y_pred = temp_model(Xbatch)
            ybatch = y[i:i+batch_size]
            loss = loss_fn(y_pred, ybatch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if epoch % 100 == 0:
            print(f'Finished epoch {epoch}, latest loss {loss}')
        if loss < best_loss:
            best_loss = loss
            # Save model checkpoint
            torch.save({
                'epoch': epoch,
                'model_state_dict': temp_model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
            }, 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/Models/' + model_name + '.pth')
            print("Saved improved model")

    test_model = PimaClassifier().to(device)
    checkpoint = torch.load('E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/Models/' + model_name + '.pth')
    test_model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    test_model.eval()
    with torch.no_grad():
        y_pred = test_model(X)

    accuracy = (y_pred.round() == y).float().mean()
    print(f"Accuracy {accuracy}")
    
    # make class predictions with the negmodel
    temp_probabilities = test_model(X).detach().float().to('cpu')
    temp_probabilities = torch.round(temp_probabilities * 1000) / 1000
    temp_probabilities = temp_probabilities.numpy()
    test_prob = pd.DataFrame(temp_probabilities,columns=['Probability'])
    test_prob.index = range(1, len(test_prob) + 1)
    temp_predictions = (test_model(X) > pred_cutoff).int().to('cpu')
    temp_predictions = temp_predictions.numpy()
    test_pred = pd.DataFrame(temp_predictions,columns=['Predicted'])
    test_pred.index = range(1, len(test_pred) + 1)
    actual_values = training_outcomes
    actual_values.columns = ['Actual']
    actual_values.index = range(1, len(actual_values) + 1)
    
    test = actual_values.join(test_pred)
    test2 = test.join(test_prob)
    test2.to_csv('E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/Predictions/training_predictions_' + model_name + '.tsv', sep='\t', index=False)

    TP = ((test['Actual'] == 1) & (test['Predicted'] == 1)).sum()
    TN = ((test['Actual'] == 0) & (test['Predicted'] == 0)).sum()
    FP = ((test['Actual'] == 0) & (test['Predicted'] == 1)).sum()
    FN = ((test['Actual'] == 1) & (test['Predicted'] == 0)).sum()
    
    # Creating a DataFrame for the confusion matrix
    confusion_matrix = pd.DataFrame([[TP, FN], [FP, TN]],
                                    columns=['Predicted Positive', 'Predicted Negative'],
                                    index=['Actual Positive', 'Actual Negative'])
    
    print("Confusion Matrix for training predictions:")
    print(confusion_matrix)
    print("Positive predictive power:")
    test_val = (confusion_matrix.iloc[0,0] / (confusion_matrix.iloc[0,0] + confusion_matrix.iloc[0,1]))*100
    print(str(round(test_val,2)) + "%")
    print("Positive predictive accuracy:")
    test_val = (confusion_matrix.iloc[0,0] / (confusion_matrix.iloc[0,0] + confusion_matrix.iloc[1,0]))*100
    print(str(round(test_val,2)) + "%")

    if actual_changes is not None:
            # Convert the list to a boolean array
            mask = np.array(((test['Actual'] == 0) & (test['Predicted'] == 1)), dtype=bool)
            
            # Use the mask to filter rows and calculate the mean
            mean_value = round(actual_changes.loc[mask,'x'].mean(),3)
            
            print("Mean change for incorrect predictions:", mean_value)
            
            # Convert the list to a boolean array
            mask = np.array(((test['Actual'] == 1) & (test['Predicted'] == 1)), dtype=bool)
            
            # Use the mask to filter rows and calculate the mean
            mean_value = round(actual_changes.loc[mask,'x'].mean(),3)
            
            print("Mean change for correct predictions:", mean_value)


def predictNN_Model_V4(model_name, prediction_name, prediction_input, pred_cutoff, actual_outcomes=None, actual_changes=None):
    # Ensure reproducibility
    np.random.seed(42)
    # Check for CUDA and set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    class PimaClassifier(nn.Module):
        def __init__(self):
            super().__init__()
            self.hidden1 = nn.Linear(prediction_input.shape[0], 64)
            self.act1 = nn.ReLU()
            self.hidden2 = nn.Linear(64, 12)
            self.act2 = nn.ReLU()
            self.hidden3 = nn.Linear(12, 8)
            self.act3 = nn.ReLU()
            self.output = nn.Linear(8, 1)
            self.act_output = nn.Sigmoid()
     
        def forward(self, x):
            x = self.act1(self.hidden1(x))
            x = self.act2(self.hidden2(x))
            x = self.act3(self.hidden3(x))
            x = self.act_output(self.output(x))
            return x
    
    model = PimaClassifier().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    checkpoint = torch.load('E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/Models/' + model_name + '.pth')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    model.eval()
    model = model.to(device)
    
    input_tensor = torch.tensor(np.array(prediction_input.T), dtype=torch.float32)
    input_tensor = input_tensor.to(device)

    temp_probabilities = model(input_tensor).detach().float().to('cpu')
    temp_probabilities = torch.round(temp_probabilities * 1000) / 1000
    temp_probabilities = temp_probabilities.numpy()
    test_prob = pd.DataFrame(temp_probabilities,columns=['Probability'])
    test_prob.index = range(1, len(test_prob) + 1)
    
    predictions = (model(input_tensor) > pred_cutoff).int().to('cpu')
    print(len(predictions))
    predictions = predictions.numpy()

    if actual_outcomes is None:
        test_pred = pd.DataFrame(predictions,columns=['Predicted'])
        test_pred.index = range(1, len(test_pred) + 1)
        test2 = test_pred.join(test_prob)
        test2.to_csv('E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/Predictions/' + prediction_name + '_predictions_' + model_name + '_output.tsv', sep='\t', index=False)
    else:
        test_pred = pd.DataFrame(predictions,columns=['Predicted'])
        test_pred.index = range(1, len(test_pred) + 1)
        actual_values = actual_outcomes
        actual_values.columns = ['Actual']
        actual_values.index = range(1, len(actual_values) + 1)
        
        test = actual_values.join(test_pred)
        test2 = test.join(test_prob)
        test2.to_csv('E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/Predictions/' + prediction_name + '_predictions_' + model_name + '_output.tsv', sep='\t', index=False)
        
        TP = ((test['Actual'] == 1) & (test['Predicted'] == 1)).sum()
        TN = ((test['Actual'] == 0) & (test['Predicted'] == 0)).sum()
        FP = ((test['Actual'] == 0) & (test['Predicted'] == 1)).sum()
        FN = ((test['Actual'] == 1) & (test['Predicted'] == 0)).sum()
        
        # Creating a DataFrame for the confusion matrix
        confusion_matrix = pd.DataFrame([[TP, FN], [FP, TN]],
                                        columns=['Predicted Positive', 'Predicted Negative'],
                                        index=['Actual Positive', 'Actual Negative'])
        
        print("Confusion Matrix for predictions:")
        print(confusion_matrix)
        print("Positive predictive power:")
        test_val = (confusion_matrix.iloc[0,0] / (confusion_matrix.iloc[0,0] + confusion_matrix.iloc[0,1]))*100
        print(str(round(test_val,2)) + "%")
        print("Positive predictive accuracy:")
        test_val = (confusion_matrix.iloc[0,0] / (confusion_matrix.iloc[0,0] + confusion_matrix.iloc[1,0]))*100
        print(str(round(test_val,2)) + "%")
        
        if actual_changes is not None:
            # Convert the list to a boolean array
            mask = np.array(((test['Actual'] == 0) & (test['Predicted'] == 1)), dtype=bool)
            
            # Use the mask to filter rows and calculate the mean
            mean_value = round(actual_changes.loc[mask,'x'].mean(),3)
            
            print("Mean change for incorrect predictions:", mean_value)
            
            # Convert the list to a boolean array
            mask = np.array(((test['Actual'] == 1) & (test['Predicted'] == 1)), dtype=bool)
            
            # Use the mask to filter rows and calculate the mean
            mean_value = round(actual_changes.loc[mask,'x'].mean(),3)
            
            print("Mean change for correct predictions:", mean_value)

    
    return predictions

def comparePredictions_Model_V4(prediction_name, input_predictions, actual_outcomes, actual_changes=None):
    test_pred = pd.DataFrame(input_predictions,columns=['Predicted','Probability'])
    test_pred.index = range(1, len(test_pred) + 1)
    actual_values = actual_outcomes
    actual_values.columns = ['Actual']
    actual_values.index = range(1, len(actual_values) + 1)
        
    test = actual_values.join(test_pred)
    test.to_csv('E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/Predictions/other_predictions_' + prediction_name + '_output.tsv', sep='\t', index=False)
            
    TP = ((test['Actual'] == 1) & (test['Predicted'] == 1)).sum()
    TN = ((test['Actual'] == 0) & (test['Predicted'] == 0)).sum()
    FP = ((test['Actual'] == 0) & (test['Predicted'] == 1)).sum()
    FN = ((test['Actual'] == 1) & (test['Predicted'] == 0)).sum()
        
    # Creating a DataFrame for the confusion matrix
    confusion_matrix = pd.DataFrame([[TP, FN], [FP, TN]],
                                    columns=['Predicted Positive', 'Predicted Negative'],
                                    index=['Actual Positive', 'Actual Negative'])
        
    print("Confusion Matrix for predictions:")
    print(confusion_matrix)
    print("Positive predictive power:")
    test_val = (confusion_matrix.iloc[0,0] / (confusion_matrix.iloc[0,0] + confusion_matrix.iloc[0,1]))*100
    print(str(round(test_val,2)) + "%")
    print("Positive predictive accuracy:")
    test_val = (confusion_matrix.iloc[0,0] / (confusion_matrix.iloc[0,0] + confusion_matrix.iloc[1,0]))*100
    print(str(round(test_val,2)) + "%")
        
    if actual_changes is not None:
        # Convert the list to a boolean array
        mask = np.array(((test['Actual'] == 0) & (test['Predicted'] == 1)), dtype=bool)
            
        # Use the mask to filter rows and calculate the mean
        mean_value = round(actual_changes.loc[mask,'x'].mean(),3)
            
        print("Mean change for incorrect predictions:", mean_value)
            
        # Convert the list to a boolean array
        mask = np.array(((test['Actual'] == 1) & (test['Predicted'] == 1)), dtype=bool)
            
        # Use the mask to filter rows and calculate the mean
        mean_value = round(actual_changes.loc[mask,'x'].mean(),3)
        
        print("Mean change for correct predictions:", mean_value)




In [4]:

def createNN_pricetimingModel_V4(model_name, training_input, training_outcomes, epoch_number, batch_number, actual_changes=None):
    # Ensure reproducibility
    np.random.seed(42)
    
    print(f"Running price estimation model on: {model_name}")
    # Check for CUDA and set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    X = torch.tensor(np.array(training_input.T), dtype=torch.float32)
    y = torch.tensor(np.array(training_outcomes), dtype=torch.float32).reshape(-1, 1)

    # Move data to the chosen device
    X = X.to(device)
    y = y.to(device)
    class PimaRegressor(nn.Module):
        def __init__(self):
            super().__init__()
            self.hidden1 = nn.Linear(training_input.shape[0], 64)
            self.act1 = nn.ReLU()
            self.hidden2 = nn.Linear(64, 12)
            self.act2 = nn.ReLU()
            self.hidden3 = nn.Linear(12, 8)
            self.act3 = nn.ReLU()
            self.output = nn.Linear(8, 1)
            #self.act_output = nn.Sigmoid()
     
        def forward(self, x):
            x = self.act1(self.hidden1(x))
            x = self.act2(self.hidden2(x))
            x = self.act3(self.hidden3(x))
            x = self.output(x)
            #x = self.act_output(self.output(x))
            return x

    temp_model = PimaRegressor().to(device)
    print(temp_model)

    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(temp_model.parameters(), lr=0.001)
    
    n_epochs = epoch_number
    batch_size = batch_number
    best_loss = float('inf')

    for epoch in range(n_epochs):
        for i in range(0, len(X), batch_size):
            Xbatch = X[i:i+batch_size]
            y_pred = temp_model(Xbatch)
            ybatch = y[i:i+batch_size]
            loss = loss_fn(y_pred, ybatch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if epoch % 100 == 0:
            print(f'Finished epoch {epoch}, latest loss {loss}')
        if loss < best_loss:
            best_loss = loss
            # Save model checkpoint
            torch.save({
                'epoch': epoch,
                'model_state_dict': temp_model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
            }, 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/Models/' + model_name + '.pth')
            print("Saved improved model")
        
    # compute accuracy (no_grad is optional)
    with torch.no_grad():
        y_pred = temp_model(X).to('cpu')
        y = y.to('cpu')
        rmse = torch.sqrt(loss_fn(y_pred, y))
        print(f"RMSE: {rmse.item()}")

def predictNN_pricetimingModel_V4(model_name, prediction_name, prediction_input, actual_outcomes=None, actual_changes=None):
    # Ensure reproducibility
    np.random.seed(42)
    # Check for CUDA and set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    class PimaRegressor(nn.Module):
        def __init__(self):
            super().__init__()
            self.hidden1 = nn.Linear(prediction_input.shape[0], 64)
            self.act1 = nn.ReLU()
            self.hidden2 = nn.Linear(64, 12)
            self.act2 = nn.ReLU()
            self.hidden3 = nn.Linear(12, 8)
            self.act3 = nn.ReLU()
            self.output = nn.Linear(8, 1)
            #self.act_output = nn.Sigmoid()
     
        def forward(self, x):
            x = self.act1(self.hidden1(x))
            x = self.act2(self.hidden2(x))
            x = self.act3(self.hidden3(x))
            x = self.output(x)
            #x = self.act_output(self.output(x))
            return x
    
    model = PimaRegressor().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    checkpoint = torch.load('E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/Models/' + model_name + '.pth')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    model.eval()
    model = model.to(device)
    
    input_tensor = torch.tensor(np.array(prediction_input.T), dtype=torch.float32)
    input_tensor = input_tensor.to(device)

    with torch.no_grad():
        predictions = model(input_tensor).to('cpu')
        print(len(predictions))
        predictions = predictions.numpy()

        test_pred = pd.DataFrame(predictions,columns=['Predicted'])
        test_pred.index = range(1, len(test_pred) + 1)
        test_pred.to_csv('E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/Predictions/' + prediction_name + '_predictions_' + model_name + '_output.tsv', sep='\t', index=False)
        return predictions



In [5]:
# Ensure reproducibility
np.random.seed(42)

all_outcome_dataframes = {}

output2up5d_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/test_output_V4_5d02pctAbove_for_NN.tsv'
output2up5d_dataset = pd.read_csv(output2up5d_file, delimiter='\t')
dataframe_name = f'output2up5d_dataset'
all_outcome_dataframes[dataframe_name] = output2up5d_dataset

output4up10d_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/test_output_V4_10d04pctAbove_for_NN.tsv'
output4up10d_dataset = pd.read_csv(output4up10d_file, delimiter='\t')
dataframe_name = f'output4up10d_dataset'
all_outcome_dataframes[dataframe_name] = output4up10d_dataset

output6up15d_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/test_output_V4_15d06pctAbove_for_NN.tsv'
output6up15d_dataset = pd.read_csv(output6up15d_file, delimiter='\t')
dataframe_name = f'output6up15d_dataset'
all_outcome_dataframes[dataframe_name] = output6up15d_dataset

output8up20d_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/test_output_V4_20d08pctAbove_for_NN.tsv'
output8up20d_dataset = pd.read_csv(output8up20d_file, delimiter='\t')
dataframe_name = f'output8up20d_dataset'
all_outcome_dataframes[dataframe_name] = output8up20d_dataset

output10up25d_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/test_output_V4_25d10pctAbove_for_NN.tsv'
output10up25d_dataset = pd.read_csv(output10up25d_file, delimiter='\t')
dataframe_name = f'output10up25d_dataset'
all_outcome_dataframes[dataframe_name] = output10up25d_dataset

output20up50d_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/test_output_V4_50d20pctAbove_for_NN.tsv'
output20up50d_dataset = pd.read_csv(output20up50d_file, delimiter='\t')
dataframe_name = f'output20up50d_dataset'
all_outcome_dataframes[dataframe_name] = output20up50d_dataset

negoutput5d_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/test_output_V4_5d0pctBelow_for_NN.tsv'
negoutput5d_dataset = pd.read_csv(negoutput5d_file, delimiter='\t')
dataframe_name = f'negoutput5d_dataset'
all_outcome_dataframes[dataframe_name] = negoutput5d_dataset

negoutput10d_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/test_output_V4_10d0pctBelow_for_NN.tsv'
negoutput10d_dataset = pd.read_csv(negoutput10d_file, delimiter='\t')
dataframe_name = f'negoutput10d_dataset'
all_outcome_dataframes[dataframe_name] = negoutput10d_dataset

negoutput15d_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/test_output_V4_15d0pctBelow_for_NN.tsv'
negoutput15d_dataset = pd.read_csv(negoutput15d_file, delimiter='\t')
dataframe_name = f'negoutput15d_dataset'
all_outcome_dataframes[dataframe_name] = negoutput15d_dataset

negoutput20d_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/test_output_V4_20d0pctBelow_for_NN.tsv'
negoutput20d_dataset = pd.read_csv(negoutput20d_file, delimiter='\t')
dataframe_name = f'negoutput20d_dataset'
all_outcome_dataframes[dataframe_name] = negoutput20d_dataset

negoutput25d_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/test_output_V4_25d0pctBelow_for_NN.tsv'
negoutput25d_dataset = pd.read_csv(negoutput25d_file, delimiter='\t')
dataframe_name = f'negoutput25d_dataset'
all_outcome_dataframes[dataframe_name] = negoutput25d_dataset

negoutput50d_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/test_output_V4_50d0pctBelow_for_NN.tsv'
negoutput50d_dataset = pd.read_csv(negoutput50d_file, delimiter='\t')
dataframe_name = f'negoutput50d_dataset'
all_outcome_dataframes[dataframe_name] = negoutput50d_dataset

for i in range(1, 51):
    # Update the file name with the current iteration number
    temp_file = f'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/test_output_V4_avgOpenClose{i}dAhead_for_NN.tsv'
    # Read the file into a DataFrame
    temp_df = pd.read_csv(temp_file, delimiter='\t')
    # Designate the name for the DataFrame
    temp_dataframe_name = f'change{i}d_dataset'
    # Add the DataFrame to the dictionary with the designated name
    all_outcome_dataframes[temp_dataframe_name] = temp_df

# Now iterate through the dictionary and execute the specified code for each DataFrame
for temp_dataframe_name, temp_df in all_outcome_dataframes.items():
    print(f"{temp_dataframe_name}:")
    print(temp_df.iloc[:5, :5])
    print(temp_df.shape)
    #print(df.value_counts())
    print()  # Adding a newline for better readability between outputs





output2up5d_dataset:
   x
1  0
2  0
3  1
4  0
5  0
(318034, 1)

output4up10d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

output6up15d_dataset:
   x
1  0
2  0
3  1
4  1
5  1
(318034, 1)

output8up20d_dataset:
   x
1  1
2  1
3  1
4  1
5  1
(318034, 1)

output10up25d_dataset:
   x
1  1
2  1
3  1
4  1
5  1
(318034, 1)

output20up50d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

negoutput5d_dataset:
   x
1  1
2  0
3  0
4  0
5  0
(318034, 1)

negoutput10d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

negoutput15d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

negoutput20d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

negoutput25d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

negoutput50d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

change1d_dataset:
       x
1  0.997
2  0.991
3  1.007
4  1.011
5  1.004
(318034, 1)

change2d_dataset:
       x
1  0.988
2  0.998
3  1.018
4  1.014
5  1.008
(318034, 1)

change3d_dataset:
       x
1  0.995
2  1.009
3  1.

In [6]:

prediction_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/test_input_V4_for_prediction_with_NN_20240314_to_20240802.tsv'

# Read the tab-separated file
# The delimiter '\t' specifies that the fields are separated by tabs
prediction_dataset = pd.read_csv(prediction_file, delimiter='\t')

realtime_predictions = {}

for temp_dataframe_name, temp_df in all_outcome_dataframes.items():
    temp_search_string = "change"
    if temp_search_string in temp_dataframe_name:
        temp_name = f'price_estimate_model_V4_{temp_dataframe_name}'
        realtime_predictions[temp_dataframe_name] = predictNN_pricetimingModel_V4(temp_name, "realtime_data", prediction_dataset)
    else:
        temp_name = f'binary_classification_model_V4_{temp_dataframe_name}'
        realtime_predictions[temp_dataframe_name] = predictNN_Model_V4(temp_name, "realtime_data", prediction_dataset, 0.9)



Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549
Using device: cuda
52549


In [17]:
###To do before utilizing model
###Check prediction accuracy by day of week
###Visualize investment simulation results for recent testing data
###Layer on sentiment score to see if it can filter out incorrect predictions

###Import dataset

input_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/test_input_V4_for_prediction_with_NN_20240314_to_20240228.tsv'

# Read the tab-separated file
# The delimiter '\t' specifies that the fields are separated by tabs
input_dataset = pd.read_csv(input_file, delimiter='\t')


# Display the first few rows of the DataFrame
print(input_dataset.iloc[:5,:5])
print(input_dataset.shape)

# Ensure reproducibility
np.random.seed(42)

# Shuffle column names
shuffled_columns = np.random.permutation(input_dataset.columns)

# Calculate split sizes
total_columns = len(input_dataset.columns)
train_size = int(0.8 * total_columns)
# The remaining columns will go to the test set

# Split the columns
train_columns = shuffled_columns[:train_size]
test_columns = shuffled_columns[train_size:]

# Create subsets of the DataFrame based on the columns
train_df = input_dataset[train_columns]
train_df.to_csv('E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/primary_training_V4_input.tsv', sep='\t', index=False)
test_df = input_dataset[test_columns]
test_df.to_csv('E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/primary_testing_V4_input.tsv', sep='\t', index=False)




                                                    A_20210723  A_20210726  \
EOD_option_call_total_volume_strike_NAto0.5_exp...       0.394       0.387   
EOD_option_call_total_open_interest_strike_NAto...       0.351       0.344   
EOD_option_put_total_volume_strike_NAto0.5_expi...       0.391       0.384   
EOD_option_put_total_open_interest_strike_NAto0...       0.250       0.243   
EOD_option_call_total_volume_strike_NAto0.5_exp...       0.395       0.389   

                                                    A_20210727  A_20210728  \
EOD_option_call_total_volume_strike_NAto0.5_exp...       0.382       0.354   
EOD_option_call_total_open_interest_strike_NAto...       0.339       0.310   
EOD_option_put_total_volume_strike_NAto0.5_expi...       0.379       0.351   
EOD_option_put_total_open_interest_strike_NAto0...       0.239       0.212   
EOD_option_call_total_volume_strike_NAto0.5_exp...       0.384       0.356   

                                                    A_20210729

output2up5d_dataset:
   x
1  0
2  0
3  1
4  0
5  0
(318034, 1)

output4up10d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

output6up15d_dataset:
   x
1  0
2  0
3  1
4  1
5  1
(318034, 1)

output8up20d_dataset:
   x
1  1
2  1
3  1
4  1
5  1
(318034, 1)

output10up25d_dataset:
   x
1  1
2  1
3  1
4  1
5  1
(318034, 1)

output20up50d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

negoutput5d_dataset:
   x
1  1
2  0
3  0
4  0
5  0
(318034, 1)

negoutput10d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

negoutput15d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

negoutput20d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

negoutput25d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

negoutput50d_dataset:
   x
1  0
2  0
3  0
4  0
5  0
(318034, 1)

change1d_dataset:
       x
1  0.997
2  0.991
3  1.007
4  1.011
5  1.004
(318034, 1)

change2d_dataset:
       x
1  0.988
2  0.998
3  1.018
4  1.014
5  1.008
(318034, 1)

change3d_dataset:
       x
1  0.995
2  1.009
3  1.

In [19]:

training_outcomes = {}
testing_outcomes = {}

for temp_dataframe_name, temp_df in all_outcome_dataframes.items():
    temp_df.index = input_dataset.columns
    training_outcomes[temp_dataframe_name] = temp_df.T[train_columns].T
    testing_outcomes[temp_dataframe_name] = temp_df.T[test_columns].T
    
    training_outcomes[temp_dataframe_name].index = range(1, len(training_outcomes[temp_dataframe_name]) + 1)
    temp_save_path = f'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/primary_training_V4_{temp_dataframe_name}_output.tsv'
    training_outcomes[temp_dataframe_name].to_csv(temp_save_path, sep='\t', index=False)

    testing_outcomes[temp_dataframe_name].index = range(1, len(testing_outcomes[temp_dataframe_name]) + 1)
    temp_save_path = f'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/Freeze20240404/primary_testing_V4_{temp_dataframe_name}_output.tsv'
    testing_outcomes[temp_dataframe_name].to_csv(temp_save_path, sep='\t', index=False)

    all_outcome_dataframes[temp_dataframe_name] = temp_df
    



In [21]:

testing_predictions = {}

for temp_dataframe_name, temp_df in all_outcome_dataframes.items():
    temp_search_value = "neg"
    if temp_search_value in temp_dataframe_name:
        temp_pos_weight = 3
        temp_pred_cutoff = 0.9
    else:
        temp_pos_weight = 1.25
        temp_pred_cutoff = 0.9
    temp_testing_outcome = testing_outcomes[temp_dataframe_name]
    temp_training_outcome = training_outcomes[temp_dataframe_name]
    temp_search_string = "change"
    if temp_search_string in temp_dataframe_name:
        temp_name = f'price_estimate_model_V4_{temp_dataframe_name}'
        createNN_pricetimingModel_V4(temp_name, train_df, temp_training_outcome, 2000, 2000)
        testing_predictions[temp_dataframe_name] = predictNN_pricetimingModel_V4(temp_name, "testing_data", test_df, temp_testing_outcome)
    else:
        temp_name = f'binary_classification_model_V4_{temp_dataframe_name}'
        createNN_Model_V4(temp_name, train_df, temp_training_outcome, temp_pos_weight, 2000, 2000, temp_pred_cutoff)
        testing_predictions[temp_dataframe_name] = predictNN_Model_V4(temp_name, "testing_data", test_df, temp_pred_cutoff, temp_testing_outcome)



Running binary classification model on: binary_classification_model_V4_output2up5d_dataset
Using device: cuda
PimaClassifier(
  (hidden1): Linear(in_features=1622, out_features=64, bias=True)
  (act1): ReLU()
  (hidden2): Linear(in_features=64, out_features=12, bias=True)
  (act2): ReLU()
  (hidden3): Linear(in_features=12, out_features=8, bias=True)
  (act3): ReLU()
  (output): Linear(in_features=8, out_features=1, bias=True)
  (act_output): Sigmoid()
)
Finished epoch 0, latest loss 0.729744017124176
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved impr

KeyError: 'rmse'

TypeError: dict.keys() takes no arguments (1 given)

In [24]:

subset_dict = {key: value for key, value in all_outcome_dataframes.items() if "change" in key}

testing_predictions = {}

for temp_dataframe_name, temp_df in subset_dict.items():
    temp_search_value = "neg"
    if temp_search_value in temp_dataframe_name:
        temp_pos_weight = 3
        temp_pred_cutoff = 0.9
    else:
        temp_pos_weight = 1.25
        temp_pred_cutoff = 0.9
    temp_testing_outcome = testing_outcomes[temp_dataframe_name]
    temp_search_string = "change"
    if temp_search_string in temp_dataframe_name:
        temp_name = f'price_estimate_model_V4_{temp_dataframe_name}'
        createNN_pricetimingModel_V4(temp_name, train_df, temp_training_outcome, 2000, 2000)
        testing_predictions[temp_dataframe_name] = predictNN_pricetimingModel_V4(temp_name, "testing_data", test_df, temp_testing_outcome)
    else:
        temp_name = f'binary_classification_model_V4_{temp_dataframe_name}'
        createNN_Model_V4(temp_name, train_df, temp_training_outcome, temp_pos_weight, 2000, 2000, temp_pred_cutoff)
        testing_predictions[temp_dataframe_name] = predictNN_Model_V4(temp_name, "testing_data", test_df, temp_pred_cutoff, temp_testing_outcome)




Running price estimation model on: price_estimate_model_V4_change1d_dataset
Using device: cuda
PimaRegressor(
  (hidden1): Linear(in_features=1622, out_features=64, bias=True)
  (act1): ReLU()
  (hidden2): Linear(in_features=64, out_features=12, bias=True)
  (act2): ReLU()
  (hidden3): Linear(in_features=12, out_features=8, bias=True)
  (act3): ReLU()
  (output): Linear(in_features=8, out_features=1, bias=True)
)
Finished epoch 0, latest loss 0.007220463827252388
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved improved model
Saved i

In [6]:
print(all_outcome_dataframes.keys())

dict_keys(['output2up5d_dataset', 'output4up10d_dataset', 'output6up15d_dataset', 'output8up20d_dataset', 'output10up25d_dataset', 'output20up50d_dataset', 'negoutput5d_dataset', 'negoutput10d_dataset', 'negoutput15d_dataset', 'negoutput20d_dataset', 'negoutput25d_dataset', 'negoutput50d_dataset', 'change1d_dataset', 'change2d_dataset', 'change3d_dataset', 'change4d_dataset', 'change5d_dataset', 'change6d_dataset', 'change7d_dataset', 'change8d_dataset', 'change9d_dataset', 'change10d_dataset', 'change11d_dataset', 'change12d_dataset', 'change13d_dataset', 'change14d_dataset', 'change15d_dataset', 'change16d_dataset', 'change17d_dataset', 'change18d_dataset', 'change19d_dataset', 'change20d_dataset', 'change21d_dataset', 'change22d_dataset', 'change23d_dataset', 'change24d_dataset', 'change25d_dataset', 'change26d_dataset', 'change27d_dataset', 'change28d_dataset', 'change29d_dataset', 'change30d_dataset', 'change31d_dataset', 'change32d_dataset', 'change33d_dataset', 'change34d_datase

In [6]:

prediction_file = 'E:/Market_Data/DiscountOptionData/DTNSubscription/revised_derived_aggregates/test_input_V4_for_prediction_with_NN_20240314_to_20240710.tsv'

# Read the tab-separated file
# The delimiter '\t' specifies that the fields are separated by tabs
prediction_dataset = pd.read_csv(prediction_file, delimiter='\t')

realtime_predictions = {}

for temp_dataframe_name, temp_df in all_outcome_dataframes.items():
    temp_search_string = "change"
    if temp_search_string in temp_dataframe_name:
        temp_name = f'price_estimate_model_V4_{temp_dataframe_name}'
        realtime_predictions[temp_dataframe_name] = predictNN_pricetimingModel_V4(temp_name, "realtime_data", prediction_dataset)
    else:
        temp_name = f'binary_classification_model_V4_{temp_dataframe_name}'
        realtime_predictions[temp_dataframe_name] = predictNN_Model_V4(temp_name, "realtime_data", prediction_dataset, 0.9)



Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
Using device: cuda
44134
