# Reading the Data

In [10]:
import os
import pandas as pd
from scipy.io import arff

def read_arff_file(filepath, dimension_name):
    """
    Reads an ARFF file, converts the data to a DataFrame, and 
    renames the columns as '[dimension_name]_t1', '[dimension_name]_t2', etc.
    """
    data, meta = arff.loadarff(filepath)
    df = pd.DataFrame(data)
    
    # Rename columns using the dimension name and time step index.
    num_columns = df.shape[1]
    df.columns = [f"{dimension_name}_t{i+1}" for i in range(num_columns)]
    
    return df

# Creating the Table (Using Pandas)

In [11]:
def process_dataset_folder(folder_path, dataset_name):
    """
    Processes one dataset folder by reading ARFF files and combining 
    them into a table. For each file, the function determines the data
    split (train or test) based on the filename and assigns a dimension name.
    It horizontally concatenates data for the same split,
    then vertically concatenates train and test data.
    """
    # List only ARFF files (ignore .txt, .png, .jpg, etc.)
    arff_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.arff')]
    
    # Lists to store DataFrames for train and test files separately.
    train_dfs = []
    test_dfs = []

    for file in arff_files:
        filepath = os.path.join(folder_path, file)
        
        # Determine if the file is train or test based on its filename.
        if "TRAIN" in file.upper():
            split_label = "train"
        elif "TEST" in file.upper():
            split_label = "test"
        else:
            print(f"Warning: Could not determine split (TRAIN/TEST) from file: {file}")
            continue
        
        # Extract the dimension name by removing the '_TRAIN' or '_TEST' portion.
        base_name = os.path.splitext(file)[0]  # e.g., "NATOPSDimension1_TRAIN"
        if "_TRAIN" in base_name.upper():
            dimension_name = base_name.upper().replace("_TRAIN", "")
        elif "_TEST" in base_name.upper():
            dimension_name = base_name.upper().replace("_TEST", "")
        else:
            dimension_name = base_name
        
        # For readability, convert the name to Title Case.
        dimension_name = dimension_name.title()
        
        # Read the ARFF file into a DataFrame for this sensor dimension.
        df_dim = read_arff_file(filepath, dimension_name)
        # Add the split information to this DataFrame.
        df_dim['split'] = split_label
        
        # Append to the appropriate list.
        if split_label == "train":
            train_dfs.append(df_dim)
        else:
            test_dfs.append(df_dim)
    
    # Concatenate data horizontally for the training part.
    if train_dfs:
        df_train = pd.concat(train_dfs, axis=1)
        # Remove duplicate columns (e.g., multiple "split" columns).
        df_train = df_train.loc[:, ~df_train.columns.duplicated()]
        # Insert sample ID column.
        df_train.insert(0, 'sid', range(1, len(df_train) + 1))
    else:
        df_train = pd.DataFrame()
    
    # Concatenate data horizontally for the testing part.
    if test_dfs:
        df_test = pd.concat(test_dfs, axis=1)
        df_test = df_test.loc[:, ~df_test.columns.duplicated()]
        df_test.insert(0, 'sid', range(1, len(df_test) + 1))
    else:
        df_test = pd.DataFrame()
    
    # Vertically stack the train and test DataFrames.
    df_dataset = pd.concat([df_train, df_test], ignore_index=True)
    
    # Optionally add a column to track the dataset source.
    df_dataset["dataset"] = dataset_name
    
    return df_dataset

def create_final_table():
    """
    Creates the final table by processing multiple dataset folders.
    For this example, we process the 'JapaneseVowels' and 'NATOPS' folders
    under 'Phase1_Data' and then combine them.
    """
    base_folder = 'Phase1_Data'
    
    # Define dataset folders (adjust as needed).
    dataset_folders = {
        "JapaneseVowels": os.path.join(base_folder, "JapaneseVowels"),
        "NATOPS": os.path.join(base_folder, "NATOPS")
    }
    
    all_datasets = []
    for dataset_name, folder_path in dataset_folders.items():
        if os.path.exists(folder_path):
            df_dataset = process_dataset_folder(folder_path, dataset_name)
            all_datasets.append(df_dataset)
        else:
            print(f"Warning: Folder not found for dataset: {dataset_name}")
    
    # Combine all dataset tables into one master DataFrame.
    final_table = pd.concat(all_datasets, ignore_index=True) if all_datasets else pd.DataFrame()
    return final_table

# Training

In [12]:
def train_model(data):
    """
    Placeholder for model training.
    In future phases, this function might preprocess features,
    split into train/test sets further, and train a machine learning model.
    """
    print("Training phase not implemented in Phase 1.")
    # Replace the following line with actual training code in future phases.
    model = None
    return model

# Testing

In [13]:
def main():
    # Create the final table by combining data from the dataset folders
    final_table = create_final_table()

    # Save the final table to a CSV file.
    output_filename = 'Phase1_Combined.csv'
    final_table.to_csv(output_filename, index=False)
    print(f"Final table saved to {output_filename}")

    # Print the first 5 rows for a quick check.
    print("First 5 rows of the final table:")
    print(final_table.head(5))

    # Placeholder for the training phase.
    model = train_model(final_table)

    # Placeholder for the testing phase.
    test_model(model, final_table)

if __name__ == '__main__':
    main()

Final table saved to Phase1_Combined.csv
First 5 rows of the final table:
   sid  Japanesevowelsdimension10_t1  Japanesevowelsdimension10_t2  \
0    1                     -0.306756                     -0.289431   
1    2                     -0.173138                     -0.181910   
2    3                     -0.237630                     -0.231087   
3    4                      0.028707                      0.038970   
4    5                     -0.115333                     -0.106838   

   Japanesevowelsdimension10_t3  Japanesevowelsdimension10_t4  \
0                     -0.314894                     -0.323267   
1                     -0.127751                     -0.182744   
2                     -0.224317                     -0.208580   
3                      0.005654                     -0.053426   
4                     -0.125721                     -0.159460   

   Japanesevowelsdimension10_t5  Japanesevowelsdimension10_t6  \
0                     -0.351171                  