# Reading the Data

In [1]:
import os
import re

import pandas as pd
from scipy.io import arff

def read_arff_file(filepath, dimension_name):
    """
    Load a single ARFF file into a DataFrame with positional column names.

    The attribute names stored in the file are discarded: the k-th column
    (1-based) is renamed to '<dimension_name>_t<k>'.

    Parameters
    ----------
    filepath : str
        Path of the ARFF file to load.
    dimension_name : str
        Prefix used for every generated column name.

    Returns
    -------
    pandas.DataFrame
        One row per ARFF record, columns '<dimension_name>_t1', '_t2', ...
    """
    # loadarff also returns the file's metadata; it is not needed here.
    records, _ = arff.loadarff(filepath)
    frame = pd.DataFrame(records)
    # NOTE(review): every column is renamed, so a non-series attribute in the
    # file (e.g. a class label) would also be labelled as a time step - confirm.
    frame.columns = [
        f"{dimension_name}_t{step}" for step in range(1, frame.shape[1] + 1)
    ]
    return frame

# Creating the Table (Using Pandas)

In [2]:
def process_dataset_folder(folder_path, dataset_name):
    """
    Combine the per-dimension ARFF files of one dataset folder into one table.

    Every '*.arff' file in `folder_path` is assigned to the train or test
    split from its filename, read with `read_arff_file`, and placed side by
    side with the other files of the same split. Each split receives a
    1-based 'sid' column (numbering restarts for each split); the train rows
    are then stacked on top of the test rows and tagged with `dataset_name`.

    Parameters
    ----------
    folder_path : str
        Directory containing the ARFF files.
    dataset_name : str
        Value written to the 'dataset' column of every row.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, with the columns 'sid', the renamed
        per-dimension columns, 'split' and 'dataset'.
    """
    # os.listdir makes no ordering guarantee; sorting keeps the column order
    # of the result identical on every run and platform.
    arff_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith('.arff')
    )
    train_dfs, test_dfs = [], []

    for file in arff_files:
        upper_name = file.upper()
        split = 'train' if 'TRAIN' in upper_name else 'test' if 'TEST' in upper_name else None
        if not split:
            print(f"Warning: couldn't mark split for {file}")
            continue

        base = os.path.splitext(file)[0]
        # Remove the '_TRAIN' / '_TEST' marker regardless of case. The split
        # detection above is case-insensitive, so the stripping must be too:
        # otherwise e.g. 'dim_train.arff' and 'dim_test.arff' would yield
        # different column prefixes and the two splits would not line up.
        dimension = re.sub(r'_(?:TRAIN|TEST)', '', base, flags=re.IGNORECASE)

        df_dim = read_arff_file(os.path.join(folder_path, file), dimension.title())
        df_dim['split'] = split

        (train_dfs if split == 'train' else test_dfs).append(df_dim)

    def concat_and_sid(dfs):
        """Join the frames of one split column-wise and prepend a 1-based 'sid'."""
        if not dfs:
            return pd.DataFrame()
        df = pd.concat(dfs, axis=1)
        # Every per-file frame carries its own 'split' column; keep only the
        # first occurrence of each repeated column name.
        df = df.loc[:, ~df.columns.duplicated()]
        df.insert(0, 'sid', range(1, len(df) + 1))
        return df

    df_train = concat_and_sid(train_dfs)
    df_test = concat_and_sid(test_dfs)
    df_all = pd.concat([df_train, df_test], ignore_index=True)
    df_all['dataset'] = dataset_name
    return df_all

def create_final_table(base_dir='Phase1_Data', dataset_name='NATOPS'):
    """
    Build the combined table for one dataset stored under `base_dir`.

    Parameters
    ----------
    base_dir : str, optional
        Directory that contains one sub-folder per dataset. Defaults to
        'Phase1_Data', resolved relative to the current working directory.
    dataset_name : str, optional
        Name of the dataset sub-folder; also passed on as the dataset label.
        Defaults to 'NATOPS'.

    Returns
    -------
    pandas.DataFrame
        Whatever `process_dataset_folder` returns for that folder.

    Raises
    ------
    FileNotFoundError
        If '<base_dir>/<dataset_name>' is not an existing directory.
    """
    dataset_folder = os.path.join(base_dir, dataset_name)
    # Fail early with a clear message instead of an error from os.listdir.
    if not os.path.isdir(dataset_folder):
        raise FileNotFoundError(f"{dataset_name} folder not found at {dataset_folder}")
    return process_dataset_folder(dataset_folder, dataset_name)

# Testing

In [3]:
def main():
    """
    Build the combined NATOPS table, save it as CSV, and print a preview.

    The CSV is written to 'Phase1_NATOPS_Combined.csv' in the current working
    directory, without the DataFrame index.
    """
    out_csv = 'Phase1_NATOPS_Combined.csv'

    combined = create_final_table()
    combined.to_csv(out_csv, index=False)

    print(f"Saved combined NATOPS data to {out_csv}")
    print("First 5 rows:")
    preview = combined.head(5)
    print(preview)


# Run the pipeline when this code is executed as a script or notebook cell.
if __name__ == '__main__':
    main()

Saved combined NATOPS data to Phase1_NATOPS_Combined.csv
First 5 rows:
   sid  Natopsdimension10_t1  Natopsdimension10_t2  Natopsdimension10_t3  \
0    1              0.599967              0.597535              0.597007   
1    2              0.622368              0.622228              0.622004   
2    3              0.588525              0.588389              0.588164   
3    4              0.576847              0.576713              0.575015   
4    5              0.717469              0.722515              0.725107   

   Natopsdimension10_t4  Natopsdimension10_t5  Natopsdimension10_t6  \
0              0.599099              0.606181              0.620752   
1              0.621909              0.621940              0.622165   
2              0.588034              0.587961              0.587895   
3              0.575267              0.572163              0.555767   
4              0.726653              0.709988              0.712092   

   Natopsdimension10_t7  Natopsdimension10_t8