In [1]:
import pandas as pd
import numpy as np 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import os

2024-11-11 08:37:00.485930: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the merged dataset from the CSV file in the current working directory
all_merged_data = pd.read_csv(filepath_or_buffer="all_merged_data.csv")

In [3]:
# Split the keyword COLUMNS (not the rows) into train / dev / test sets.
#
# The 'date' column is set aside before any sampling so it can never be drawn
# into a split by chance, and it is re-attached as the FIRST column of every
# split. That keeps the three frames structurally identical, so positional
# indexing such as `columns[1:]` skips 'date' consistently in all of them.
date_column = all_merged_data[['date']]
keyword_data = all_merged_data.drop(columns='date')

# Hold out 10% of the keyword columns as the test set
sample_size_test = int(keyword_data.shape[1] * 0.1)
sampled_columns_test = keyword_data.sample(n=sample_size_test, axis=1, random_state=42)  # random_state for reproducibility
remaining_keywords = keyword_data.drop(columns=sampled_columns_test.columns)  # Remaining 90% of keyword columns

# Hold out 10% of the remaining keyword columns as the dev set
sample_size_dev = int(remaining_keywords.shape[1] * 0.1)
sampled_columns_dev = remaining_keywords.sample(n=sample_size_dev, axis=1, random_state=42)  # random_state for reproducibility

# Assemble the final frames: 'date' first, followed by that split's keywords
test = pd.concat([date_column, sampled_columns_test], axis=1)
dev = pd.concat([date_column, sampled_columns_dev], axis=1)
train = pd.concat(
    [date_column, remaining_keywords.drop(columns=sampled_columns_dev.columns)],
    axis=1,
)

# Sanity check: the only column the splits may share is 'date'
assert set(train.columns) & set(dev.columns) == {'date'}
assert set(train.columns) & set(test.columns) == {'date'}
assert set(dev.columns) & set(test.columns) == {'date'}

In [4]:
# Function to create sequences for one keyword trend
def create_sequences_for_keyword(data_column, time_step):
    """Build sliding-window (input, target) pairs from one keyword's series.

    Each sample consists of ``time_step`` consecutive observations and the
    target is the observation immediately following that window.

    Parameters
    ----------
    data_column : array-like
        The series values in chronological order. Lists, NumPy arrays and
        pandas Series are accepted; the input is converted with
        ``np.asarray`` so windows are always taken by position, never by
        index label.
    time_step : int
        Window length (number of past values per sample). Must be >= 1.

    Returns
    -------
    X : np.ndarray
        Windows, shape ``(len(data_column) - time_step, time_step)``.
    y : np.ndarray
        Next-step targets, shape ``(len(data_column) - time_step,)``.
        When the series is not longer than ``time_step`` both arrays are
        empty, but ``X`` keeps its second dimension (``(0, time_step)``) so
        callers can still reshape it.

    Raises
    ------
    ValueError
        If ``time_step`` is smaller than 1.
    """
    if time_step < 1:
        raise ValueError(f"time_step must be >= 1, got {time_step}")

    values = np.asarray(data_column)
    n_samples = len(values) - time_step

    if n_samples <= 0:
        # Not enough observations for a single window: return empty arrays
        # that still carry the expected trailing dimensions.
        trailing_shape = values.shape[1:]
        X_empty = np.empty((0, time_step) + trailing_shape, dtype=values.dtype)
        y_empty = np.empty((0,) + trailing_shape, dtype=values.dtype)
        return X_empty, y_empty

    # Row i of window_index holds the positions i, i+1, ..., i+time_step-1
    window_index = np.arange(n_samples)[:, None] + np.arange(time_step)[None, :]
    X = values[window_index]
    y = values[time_step:].copy()  # copy so the result does not alias the input
    return X, y

In [5]:
# Define the LSTM models to train

# Hyperparameter grid
layers = list(range(2, 8))            # number of stacked LSTM layers
time_steps = list(range(10, 60, 10))  # input window lengths
nodes = [50, 75, 100]                 # units per LSTM layer
dropouts = [0.1, 0.2, 0.3]            # dropout rate applied after every LSTM layer


def build_lstm_model(layer, time_step, node, dropout):
    """Build and compile one stacked-LSTM regressor.

    Parameters
    ----------
    layer : int
        Total number of LSTM layers (>= 1). Every LSTM layer is followed by
        a Dropout layer.
    time_step : int
        Length of the input window; the model input shape is
        ``(time_step, 1)``.
    node : int
        Number of units in each LSTM layer.
    dropout : float
        Rate used by every Dropout layer.

    Returns
    -------
    A compiled ``Sequential`` model with a single-unit Dense output, using
    the Adam optimizer and mean-squared-error loss.
    """
    model = Sequential()

    # Declare the input explicitly instead of passing `input_shape=` to the
    # first LSTM layer, which Keras warns against for Sequential models.
    model.add(tf.keras.Input(shape=(time_step, 1)))

    # All LSTM layers except the last return the full sequence so that the
    # next LSTM layer receives one vector per time step.
    for _ in range(layer - 1):
        model.add(LSTM(units=node, activation='tanh', return_sequences=True))
        model.add(Dropout(dropout))

    # Final hidden layer, not returning sequences
    model.add(LSTM(units=node, activation='tanh'))
    model.add(Dropout(dropout))

    model.add(Dense(1))  # Output layer to predict the next value for the trend

    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


# One model per grid point, in (layer, time_step, node, dropout) nesting order.
# NOTE: every model is built up front and kept in memory.
model_list = [
    build_lstm_model(layer, time_step, node, dropout)
    for layer in layers
    for time_step in time_steps
    for node in nodes
    for dropout in dropouts
]

  super().__init__(**kwargs)


In [None]:
# Train every candidate model on all training keywords, score it on the dev
# keywords, and checkpoint the accumulated results to CSV after each model.
average_mse_per_model = []
layers_per_model = []
nodes_per_model = []
dropout_per_model = []
time_step_per_model = []

# Pick keyword columns by NAME rather than by position: the 'date' column is
# not guaranteed to be the first column of every split.
train_keywords = [column for column in train.columns if column != 'date']
dev_keywords = [column for column in dev.columns if column != 'date']

for model in model_list:
    # Recover this model's hyperparameters from its own structure
    time_step = model.input_shape[1]
    lstm_layers = [layer for layer in model.layers if isinstance(layer, LSTM)]
    dropout_layers = [layer for layer in model.layers if isinstance(layer, Dropout)]

    time_step_per_model.append(time_step)
    # Count only LSTM layers so the value matches the tuned hyperparameter
    # (len(model.layers) would also count the Dropout and Dense layers).
    layers_per_model.append(len(lstm_layers))
    nodes_per_model.append(lstm_layers[0].units)
    dropout_per_model.append(dropout_layers[0].rate)

    mse_list = []

    # Iterate over each training keyword column
    for keyword in train_keywords:
        print(f"Training RNN model for keyword: {keyword}")

        # Extract the single keyword's trend data as an array
        single_keyword_data = train[keyword].values

        # Create sequences for the single keyword
        X, y = create_sequences_for_keyword(single_keyword_data, time_step)
        if len(X) == 0:
            # Series is not longer than the window: nothing to train on
            continue

        # Reshape X to match RNN input format (samples, timesteps, features)
        X = X.reshape((X.shape[0], time_step, 1))  # 1 feature per keyword

        # Continue training the same model on this keyword's data
        history = model.fit(X, y, epochs=1, batch_size=16, verbose=1)

    # Once all training data has been seen, evaluate MSE on the dev keywords
    for keyword in dev_keywords:
        # Extract the single keyword's trend data as an array
        single_keyword_data = dev[keyword].values

        # Create sequences for the single keyword
        X_dev, y_dev = create_sequences_for_keyword(single_keyword_data, time_step)
        if len(X_dev) == 0:
            # Series is not longer than the window: nothing to score
            continue

        # Same (samples, timesteps, features) layout as used for training
        X_dev = X_dev.reshape((X_dev.shape[0], time_step, 1))

        # Predict on the validation set; flatten (n, 1) -> (n,) to match y_dev
        y_pred = model.predict(X_dev).ravel()

        # Compute MSE for this keyword
        mse = mean_squared_error(y_dev, y_pred)
        mse_list.append(mse)

    # Average over dev keywords; NaN when no dev keyword could be scored
    average_mse = sum(mse_list) / len(mse_list) if mse_list else float('nan')
    average_mse_per_model.append(average_mse)

    # Create and save a dataframe with the latest results. This runs inside
    # the model loop so partial results survive an interrupted run.
    results = pd.DataFrame(
        {'Average MSE': average_mse_per_model,
         'Number of Layers': layers_per_model,
         'Number of Nodes': nodes_per_model,
         'Dropout Rate': dropout_per_model,
         'Time Step': time_step_per_model
        })

    results.to_csv('HyperParameterTuningResults.csv', index=False)


Training RNN model for keyword: Dumbbells_x
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - loss: 1.0129
Training RNN model for keyword: Yoga mats_x
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0557
Training RNN model for keyword: Resistance bands_x
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0444
Training RNN model for keyword: Kettlebells
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0019
Training RNN model for keyword: Treadmills_x
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0136
Training RNN model for keyword: Jump ropes_x
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0135 
Training RNN model for keyword: Protein powder_x
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.2903
Training RNN model for keyword: Exercise balls
[1m15/15[0