# Sequence Preparation
This notebook prepares a data file for use in ML experiments. The output is a CSV augmented with two columns: a sequence group column and a label (target) column, for a classification problem in which machine learning models predict the target. The sequence group column is used to group related rows into sequences. The label column indicates the target value for each sequence. The resulting file is used to train and test machine learning models that predict the target value for new sequences.

In [5]:
# perform the imports
from pathlib import Path

import pandas as pd


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [6]:
# Notebook-wide configuration: input/output locations and grouping settings.

# CSV read by the preparation step.
file_in = "./data/sample.csv"

# CSV written by the preparation step.
file_out = "./data_prod/sample.csv"

# Number of consecutive rows that make up one sequence group (tune as needed).
rows_per_group = 5

# Columns, in order, that are kept in the output CSV.
columns_to_write = [
    "date",
    "open",
    "high",
    "low",
    "close",
    "volume",
    "sequence_group",
]

In [21]:
# Load the data, tag complete per-day sequence groups, and write the selected columns.
#
# Each calendar day is cut into consecutive chunks of `rows_per_group` rows.
# Every complete chunk gets the next sequence number; leftover rows at the end
# of a day (fewer than `rows_per_group`) are not numbered and are dropped.

# Fail early with a clear message instead of a ZeroDivisionError (0) or a
# silently empty output file (negative values).
if rows_per_group < 1:
    raise ValueError(f"rows_per_group must be a positive integer, got {rows_per_group!r}")

df = pd.read_csv(file_in)

# Convert 'date' column to datetime format for easier manipulation
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y %H:%M')

# Rows that never receive a group number keep None and are removed below
df['sequence_group'] = None

# Group by day and number the complete chunks; the counter keeps increasing
# across days so every sequence group number is unique in the file.
group_number = 1
for _, group in df.groupby(df['date'].dt.date):
    num_full_groups = len(group) // rows_per_group
    for i in range(num_full_groups):
        chunk_index = group.index[i * rows_per_group:(i + 1) * rows_per_group]
        df.loc[chunk_index, 'sequence_group'] = group_number
        group_number += 1

# Remove rows where sequence_group is None
df_final = df.dropna(subset=['sequence_group'])

# Make sure the destination folder exists; to_csv does not create it
Path(file_out).parent.mkdir(parents=True, exist_ok=True)

# Save the modified dataframe with only the specified columns to a new CSV file
df_final.to_csv(file_out, index=False, columns=columns_to_write)
print(f"Output file saved to {file_out}")

Output file saved to ./data_prod/sample.csv


In [1]:
# Imports for normalizing the data (column transformer and scalers)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [15]:
import pandas as pd

# Rolling-window demo: every run of `rows_per_group` consecutive records becomes
# one sequence group, labelled with the 'reversal' value of the record that
# immediately follows the window.
# NOTE: this cell rebinds `rows_per_group` and `df`, which earlier cells also set.
rows_per_group = 2

# Small hand-built frame standing in for data that would normally be read
# from a CSV file.
data = {
    'barcount': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'date': ['8/1/2023 07:01', '8/1/2023 07:02', '8/1/2023 07:03', '8/1/2023 07:04', '8/2/2023 07:05', '8/2/2023 07:06', '8/2/2023 07:07', '8/3/2023 07:08', '8/3/2023 07:09', '8/4/2023 07:10'],
    'feature1': ['a', 'c', 'e', 'g', 'i', 'k', 'm', 'o', 'q', 's'],
    'feature2': ['b', 'd', 'f', 'h', 'j','l', 'n', 'p', 'r', 't'],
    'reversal': [False, False, True, False, True, False, False, True, False, False]
}

# Create the DataFrame
df = pd.DataFrame(data)
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y %H:%M')
df['logical_date'] = df['date'].dt.date

# Collect one labelled copy per window. Only windows that are followed by at
# least one more record are produced, so every group is complete
# (`rows_per_group` rows) and every group has a real future_reversal label.
windows = []
for start in range(len(df) - rows_per_group):
    stop = start + rows_per_group
    windows.append(
        df.iloc[start:stop].assign(
            sequence_group=start + 1,                    # 1-based window number
            future_reversal=df['reversal'].iloc[stop],   # label = next record's reversal
        )
    )

# Concatenate once (instead of growing a frame inside the loop); fall back to
# an empty frame with the same columns when the input is too short for a window.
if windows:
    final_df = pd.concat(windows, ignore_index=True)
else:
    final_df = df.iloc[:0].assign(
        sequence_group=pd.Series(dtype='int64'),
        future_reversal=pd.Series(dtype='bool'),
    )

# Show the final DataFrame
final_df

Unnamed: 0,barcount,date,feature1,feature2,reversal,logical_date,sequence_group,future_reversal
0,1,2023-08-01 07:01:00,a,b,False,2023-08-01,1,True
1,2,2023-08-01 07:02:00,c,d,False,2023-08-01,1,True
2,2,2023-08-01 07:02:00,c,d,False,2023-08-01,2,False
3,3,2023-08-01 07:03:00,e,f,True,2023-08-01,2,False
4,3,2023-08-01 07:03:00,e,f,True,2023-08-01,3,True
5,4,2023-08-01 07:04:00,g,h,False,2023-08-01,3,True
6,4,2023-08-01 07:04:00,g,h,False,2023-08-01,4,False
7,5,2023-08-02 07:05:00,i,j,True,2023-08-02,4,False
8,5,2023-08-02 07:05:00,i,j,True,2023-08-02,5,False
9,6,2023-08-02 07:06:00,k,l,False,2023-08-02,5,False
