# Reading the Data

In [160]:
import os
import re
import pandas as pd
from scipy.io import arff
from functools import reduce

def read_arff_file(filepath, dimension_name):
    """
    - Load an ARFF into a DataFrame.
    - Decode byte-strings → UTF-8.
    - Cast numeric columns where possible.
    - Rename 'class'/'classAttribute' → 'class';
      prefix all other attrs with '{dimension_name}_'.
    """
    raw_data, _ = arff.loadarff(filepath)
    df = pd.DataFrame(raw_data)

    # 1) decode bytes
    for col in df.select_dtypes([object]):
        df[col] = df[col].apply(
            lambda x: x.decode('utf-8') if isinstance(x, bytes) else x
        )

    # 2) cast numeric
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='ignore')

    # 3) rename
    new_cols = []
    for col in df.columns:
        low = col.lower()
        if low in ('class','classattribute'):
            new_cols.append('class')
        else:
            new_cols.append(f"{dimension_name}_{col}")
    df.columns = new_cols

    return df

# Phase 1: Combine NATOPS ARFF Files

In [161]:
def concat_and_sid(dfs):
    """Horizontally concat, drop duplicate cols."""
    if not dfs:
        return pd.DataFrame()
    df = pd.concat(dfs, axis=1)
    return df.loc[:, ~df.columns.duplicated()]

def process_dataset_folder(folder):
    """Read all TRAIN/TEST ARFFs, split, concat per split, stack, add global sid."""
    files = sorted(os.listdir(folder))
    train_dfs, test_dfs = [], []
    for fn in files:
        if not fn.lower().endswith('.arff'):
            continue
        u = fn.upper()
        split = 'train' if 'TRAIN' in u else 'test'
        base = os.path.splitext(fn)[0]
        dim = base.upper().replace('_TRAIN','').replace('_TEST','').lower()
        d = read_arff_file(os.path.join(folder, fn), dim)
        d['split'] = split
        (train_dfs if split=='train' else test_dfs).append(d)

    df_train = concat_and_sid(train_dfs)
    df_test  = concat_and_sid(test_dfs)
    df_all   = pd.concat([df_train, df_test], ignore_index=True)

    # assign global sid = 1…N
    df_all = df_all.reset_index(drop=True)
    df_all.insert(0, 'sid', range(1, len(df_all)+1))
    return df_all

def melt_to_long(df):
    """Turn wide → long: one row per (sid, split, class, time_step)."""
    id_vars = [c for c in ('sid','split','class') if c in df.columns]
    pat = re.compile(r'^(?P<prefix>.+)_(?P<step>\d+)$')
    groups = {}
    for c in df.columns:
        m = pat.match(c)
        if m and c not in id_vars:
            groups.setdefault(m.group('prefix'), []).append(c)

    melted = []
    for prefix, cols in sorted(groups.items()):
        cols_sorted = sorted(cols, key=lambda x: int(pat.match(x).group('step')))
        m = df[id_vars + cols_sorted].melt(
            id_vars=id_vars,
            value_vars=cols_sorted,
            var_name='time',
            value_name=prefix
        )
        m['time_step'] = m['time'].str.extract(r'_(\d+)$').astype(int)
        melted.append(m.drop(columns='time'))

    # merge on sid, split, class, time_step
    df_long = reduce(
        lambda L, R: pd.merge(L, R, on=['sid','split','class','time_step']),
        melted
    )
    return df_long

# Phase 2: Clustering & Atomic-Unit Generation

In [162]:
def perform_clustering(df, feature_cols, n_clusters, random_state=42):
    numeric = [c for c in feature_cols if np.issubdtype(df[c].dtype, np.number)]
    df = df.copy()
    df['cluster'] = KMeans(n_clusters=n_clusters, random_state=random_state).fit_predict(df[numeric])
    return df

def compute_ratio_features(df, n_clusters):
    group_cols = [c for c in ('split','sid','class') if c in df.columns]
    ratios = (
        df
        .groupby(group_cols)['cluster']
        .value_counts(normalize=True)
        .unstack(fill_value=0)
    )
    ratios.columns = [f"cluster_{int(c)}_ratio" for c in ratios.columns]
    return ratios.reset_index()

# Testing

In [163]:
def main():
    base_folder = os.path.join('Phase1_Data', 'NATOPS')
    df_wide = process_dataset_folder(base_folder)

    # Phase 1: sample–time long form
    df_long = melt_to_time_steps(df_wide)

    # 1) isTest column
    df_long['isTest'] = (df_long['split'] == 'test').astype(int)

    # 2) drop extra columns
    df_long = df_long.drop(columns=['split','time_step'], errors='ignore')

    # 3) rename natopsdimension… → fea1…fea24
    dim_cols = sorted(
        [c for c in df_long.columns if c.startswith('natopsdimension')],
        key=lambda x: int(re.search(r'natopsdimension(\d+)', x).group(1))
    )
    rename_map = {col: f"fea{i+1}" for i, col in enumerate(dim_cols)}
    df_long = df_long.rename(columns=rename_map)

    # 4) reorder to isTest, fea1…fea24, sid, class
    final_cols = ['isTest'] + [f"fea{i}" for i in range(1, 25)] + ['sid','class']
    df_long = df_long[final_cols]

    df_long = df_long[df_long['sid'].between(1, 24)]


    df_long.to_csv('Phase1_NATOPS_Combined_long.csv', index=False)
    print(f"Saved {len(df_long)} rows to Phase1_NATOPS_Combined_long.csv")

    # Phase 2 clustering
    feat_cols = [c for c in df_long.columns if c.startswith('fea')]
    df_clust = perform_clustering(df_long, feat_cols, n_clusters=10)
    df_phase2 = compute_ratio_features(df_clust, n_clusters=10)
    df_phase2.to_csv('Phase2_NATOPS_AtomicUnits.csv', index=False)

if __name__ == '__main__':
    main()

  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = 

Saved 1224 rows to Phase1_NATOPS_Combined_long.csv
