# Reading the Data

In [61]:
import os
import pandas as pd
from scipy.io import arff
from sklearn.cluster import KMeans
import numpy as np
from functools import reduce

def read_arff_file(filepath, dimension_name):
    """
    Load one ARFF file and return it as a cleaned-up DataFrame.

    - Byte-string cells are decoded to UTF-8 text.
    - Every column that pandas can parse as numeric is converted;
      columns that cannot be parsed are left as they are.
    - A column named 'class' or 'classAttribute' (any letter case)
      becomes 'class'; every other column becomes
      '{dimension_name}_{original name}'.
    """
    records, _meta = arff.loadarff(filepath)
    frame = pd.DataFrame(records)

    def _to_text(value):
        # Only bytes are decoded; every other value passes through untouched.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    # Pass 1: bytes -> str on object-typed columns.
    for name in frame.select_dtypes([object]):
        frame[name] = frame[name].apply(_to_text)

    # Pass 2: best-effort numeric conversion, column by column.
    for name in frame.columns:
        try:
            converted = pd.to_numeric(frame[name])
        except (ValueError, TypeError):
            continue
        frame[name] = converted

    # Pass 3: normalise the label column name and prefix all the others.
    label_aliases = ('class', 'classattribute')
    frame.columns = [
        'class' if name.lower() in label_aliases
        else f"{dimension_name}_{name}"
        for name in frame.columns
    ]

    return frame

# Phase 1: Combine NATOPS ARFF Files

In [62]:
def concat_and_sid(dfs):
    """
    Build one wide DataFrame from a list of per-dimension DataFrames.

    - The frames are placed side by side (column-wise concat).
    - When a column name occurs more than once, only its first
      occurrence is kept.
    - A 'sid' column counting rows from 1 is inserted as the first column.
    - An empty input list yields an empty DataFrame.
    """
    if dfs:
        wide = pd.concat(dfs, axis=1)
        first_occurrence = ~wide.columns.duplicated()
        wide = wide.loc[:, first_occurrence]
        row_count = len(wide)
        wide.insert(0, 'sid', range(1, row_count + 1))
        return wide
    return pd.DataFrame()

def process_dataset_folder(folder_path, dataset_name):
    """
    - Reads all .arff in folder_path (in sorted file-name order).
    - Splits them into train/test by filename ('TRAIN' / 'TEST' anywhere
      in the name, case-insensitive); other files are skipped with a notice.
    - Builds a wide table per split, adds sid, then stacks train+test.
    - Tags with a `dataset` column.

    Raises RuntimeError when the stacked table has no column other than
    'sid', 'split' and 'class'.
    """
    # os.listdir gives no ordering guarantee; sorting keeps the column order
    # (and which duplicated 'class'/'split' column survives) reproducible.
    arff_files = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith('.arff')
    )
    train_dfs, test_dfs = [], []

    for fn in arff_files:
        name_upper = fn.upper()
        if 'TRAIN' in name_upper:
            split = 'train'
        elif 'TEST' in name_upper:
            split = 'test'
        else:
            print(f"⚠️  Skipping unrecognized file: {fn}")
            continue

        # Dimension name = file stem without the _TRAIN/_TEST marker, lower-cased.
        base = os.path.splitext(fn)[0]
        dimension = base.upper().replace('_TRAIN','').replace('_TEST','').lower()
        df_dim = read_arff_file(os.path.join(folder_path, fn), dimension)
        df_dim['split'] = split

        if split == 'train':
            train_dfs.append(df_dim)
        else:
            test_dfs.append(df_dim)

    df_train = concat_and_sid(train_dfs)
    df_test  = concat_and_sid(test_dfs)

    # Skip splits that produced no columns so a missing split does not
    # take part in the row-wise concat.
    parts = [part for part in (df_train, df_test) if part.shape[1]]
    df_all = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

    # Quick sanity check
    feature_cols = [
        c for c in df_all.columns
        if c not in ('sid','split','class')
    ]
    if not feature_cols:
        raise RuntimeError("No feature columns found in wide DataFrame. "
                           "Check that your .arff files were read correctly.")
    df_all['dataset'] = dataset_name
    return df_all

def melt_to_time_steps(df):
    """
    Turn the wide table (one row per sid) into long form:
    - one row per (sid, time_step)
    - columns: whichever of sid, split, class exist in df, time_step,
      plus one column per dimension

    A dimension is the part of a column name before its first '_'; the
    rest of the name is the time suffix. Suffixes are ordered naturally
    (digit runs compare as integers), so '..._2' comes before '..._10'
    and time_step 0, 1, 2, ... follows that order.

    Raises RuntimeError when no dimension columns are found.
    """
    import re  # function-scope import: only the natural sort key needs it

    id_vars = [c for c in ('sid','split','class') if c in df.columns]
    reserved = set(id_vars) | {'dataset'}

    # all dimension prefixes (anything before the first '_', excluding id_vars+dataset)
    dims = sorted({
        col.split('_')[0]
        for col in df.columns
        if '_' in col and col.split('_')[0] not in reserved
    })

    if not dims:
        raise RuntimeError("No dimensions detected—nothing to melt.")

    def natural_key(text):
        # re.split with a capturing group puts digit runs at odd positions.
        # Tagging each piece (0 = number, 1 = text) means an int is never
        # compared with a str, whatever the suffixes look like.
        pieces = re.split(r'(\d+)', text)
        return [
            (0, int(piece)) if pos % 2 else (1, piece)
            for pos, piece in enumerate(pieces)
            if piece
        ]

    melted_dfs = []
    for dim in dims:
        prefix = f"{dim}_"
        time_cols = sorted(
            (c for c in df.columns if c.startswith(prefix)),
            key=lambda c: natural_key(c[len(prefix):]),
        )
        if not time_cols:
            continue

        m = df[id_vars + time_cols].melt(
            id_vars=id_vars,
            value_vars=time_cols,
            var_name='time',
            value_name=dim
        )
        # Position of the source column in the ordered list is the time step.
        step_of = {col: i for i, col in enumerate(time_cols)}
        m['time_step'] = m['time'].map(step_of)
        m = m.drop(columns='time')
        melted_dfs.append(m)

    if not melted_dfs:
        raise RuntimeError("After filtering, no dimension had time-step columns.")

    # Merge on the id columns that actually exist, plus time_step; a fixed
    # key list would fail whenever split or class is absent from df.
    merge_keys = id_vars + ['time_step']
    df_long = melted_dfs[0]
    for df_next in melted_dfs[1:]:
        df_long = pd.merge(df_long, df_next, on=merge_keys)
    return df_long

# Phase 2: Clustering & Atomic-Unit Generation

In [63]:
def perform_clustering(df, feature_cols, n_clusters, random_state=42):
    """
    Fit KMeans on the numeric columns among feature_cols.

    - Columns whose dtype is not a NumPy numeric type are left out and
      reported with a printed notice.
    - The predicted cluster label of each row is stored in df['cluster'];
      df is modified in place and also returned.
    """
    usable = []
    for name in feature_cols:
        if np.issubdtype(df[name].dtype, np.number):
            usable.append(name)

    skipped = set(feature_cols) - set(usable)
    if skipped:
        print(f"Dropped non-numeric before clustering: {skipped}")

    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = model.fit_predict(df[usable].to_numpy())
    df['cluster'] = labels
    return df

def compute_ratio_features(df, n_clusters):
    """
    For each (split, isTest, sid, class) group, compute the normalized
    counts of each cluster → cluster_i_ratio features.

    - Only the grouping columns that exist in df are used. 'isTest' is
      one of them so that rows from different splits sharing a sid are
      not pooled when 'split' itself is absent.
    - The result always has a column for every cluster id in
      0..n_clusters-1 (0.0 where a cluster never occurs), plus a column
      for any other id found in df['cluster'].
    - Returns one row per group with the grouping columns first.
    """
    group_cols = [
        c for c in ('split', 'isTest', 'sid', 'class')
        if c in df.columns
    ]
    ratios = (
        df
        .groupby(group_cols)['cluster']
        .value_counts(normalize=True)
        .unstack(fill_value=0)
    )

    # Plain-int labels, so the reindex below matches by value.
    ratios.columns = [int(c) for c in ratios.columns]

    # Union keeps any label outside the expected range rather than dropping it.
    cluster_ids = sorted(set(range(n_clusters)) | set(ratios.columns))
    ratios = ratios.reindex(columns=cluster_ids, fill_value=0.0)

    ratios.columns = [f"cluster_{c}_ratio" for c in cluster_ids]
    return ratios.reset_index()

# Testing

In [64]:
def main():
    """
    Run the NATOPS pipeline end to end.

    - Phase 1: read the ARFF files under Phase1_Data/NATOPS into a wide
      table, reshape it to long form, rename the headers, and write both
      tables to CSV.
    - Phase 2: cluster the long-form rows on the fea* columns and write
      the per-group cluster ratios to CSV.

    Raises FileNotFoundError when the NATOPS folder does not exist.
    """
    # Phase 1: ingest wide
    base_folder = os.path.join('Phase1_Data','NATOPS')
    if not os.path.isdir(base_folder):
        raise FileNotFoundError(f"NATOPS folder not found at {base_folder}")

    df_wide = process_dataset_folder(base_folder, 'NATOPS')
    print("Phase 1 (wide) sample:")
    print(df_wide.head(), "\n")
    df_wide.to_csv('Phase1_NATOPS_Combined_wide.csv', index=False)

    # Phase 1: reshape to long
    df_long = melt_to_time_steps(df_wide)

    # --- Begin header-renaming block ---
    # 1) Create isTest (1 if test, else 0)
    df_long['isTest'] = (df_long['split'] == 'test').astype(int)

    # 2) Drop split, time_step, and dataset (whichever of them are present)
    df_long = df_long.drop(columns=['split','time_step','dataset'], errors='ignore')

    # 3) Rename natopsdimension1…natopsdimensionN → fea1…feaN,
    #    ordered by the numeric part of the name
    nat_cols = sorted(
        [c for c in df_long.columns if c.startswith('natopsdimension')],
        key=lambda x: int(x.replace('natopsdimension',''))
    )
    rename_dict = {col: f"fea{i+1}" for i, col in enumerate(nat_cols)}
    df_long = df_long.rename(columns=rename_dict)

    # 4) Reorder to isTest, fea1…feaN, sid, class
    feature_cols = [f"fea{i}" for i in range(1, len(nat_cols)+1)]
    df_long = df_long[['isTest'] + feature_cols + ['sid', 'class']]
    # --- End header-renaming block ---

    print("Phase 1 (long – corrected) sample:")
    print(df_long.head(), "\n")
    df_long.to_csv('Phase1_NATOPS_Combined_long.csv', index=False)

    # Phase 2: clustering & ratio features
    n_clusters = 10
    # Cluster on the fea* columns only. Filtering df_long.columns by an
    # exclusion list would let 'isTest' through, and the train/test flag
    # is not a feature.
    df_clust = perform_clustering(df_long.copy(), feature_cols, n_clusters)
    df_phase2 = compute_ratio_features(df_clust, n_clusters)

    print("Phase 2 sample:")
    print(df_phase2.head(), "\n")
    df_phase2.to_csv('Phase2_NATOPS_AtomicUnits.csv', index=False)

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()

Phase 1 (wide) sample:
   sid  natopsdimension10_channel_9_0  natopsdimension10_channel_9_1  \
0    1                       0.599967                       0.597535   
1    2                       0.622368                       0.622228   
2    3                       0.588525                       0.588389   
3    4                       0.576847                       0.576713   
4    5                       0.717469                       0.722515   

   natopsdimension10_channel_9_2  natopsdimension10_channel_9_3  \
0                       0.597007                       0.599099   
1                       0.622004                       0.621909   
2                       0.588164                       0.588034   
3                       0.575015                       0.575267   
4                       0.725107                       0.726653   

   natopsdimension10_channel_9_4  natopsdimension10_channel_9_5  \
0                       0.606181                       0.620752   
1      