# Sequence Preparation
This notebook prepares a data file for use in ML experiments. The output is a CSV augmented with a sequence group column and a label (target) column, for a classification problem in which machine learning models predict the target. The sequence group column groups rows that belong to the same sequence, and the label column gives the target value for each sequence. The resulting file is used to train and test machine learning models that predict the target value for new sequences.

In [3]:
# perform the imports
from pathlib import Path

import pandas as pd


In [15]:
# Notebook-wide configuration used by the cells below.

# Source CSV to read and destination CSV to write.
file_in = "./data/sample.csv"
file_out = "./data_prod/sample.csv"

# Number of consecutive rows that make up one sequence group; tune as required.
rows_per_group = 5

# Columns (and their order) kept in the output file.
columns_to_write = [
    "date",
    "open",
    "high",
    "low",
    "close",
    "volume",
    "sequence_group",
]



In [17]:
# Build the sequence-grouped dataset: read file_in, tag every complete run of
# rows_per_group rows within a calendar day with a sequence_group id, drop the
# rows that did not fit into a complete run, and write the result to file_out.

# Load the data
df = pd.read_csv(file_in)

# Convert 'date' column to datetime format so rows can be bucketed by calendar day
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y %H:%M')

# Start with every row unassigned; rows still None after the loop are dropped below
df['sequence_group'] = None

# Number the groups consecutively across days, starting at 1. Within each day
# only complete chunks of rows_per_group rows (taken in file order) receive a
# number, so a sequence never spans two days and leftover rows stay unassigned.
group_number = 1
for _, day_rows in df.groupby(df['date'].dt.date):
    day_index = day_rows.index
    # Count of rows covered by complete groups; any remainder is left out.
    rows_in_full_groups = (len(day_index) // rows_per_group) * rows_per_group
    for start in range(0, rows_in_full_groups, rows_per_group):
        df.loc[day_index[start:start + rows_per_group], 'sequence_group'] = group_number
        group_number += 1

# Remove rows where sequence_group is None
df_final = df.dropna(subset=['sequence_group'])

# Make sure the destination directory exists; to_csv does not create it and
# would otherwise fail with an OSError when the folder is missing.
Path(file_out).parent.mkdir(parents=True, exist_ok=True)

# Save the modified dataframe with only the specified columns to a new CSV file
df_final.to_csv(file_out, index=False, columns=columns_to_write)
print(f"Output file saved to {file_out}")


Output file saved to ./data_prod/sample.csv
