# Applicability of training ML model with TDA using Delaunay-Rips complex vs. using Rips vs. using Alpha
Author: Amish Mishra  
Date: November 1, 2022

## Notes
* We will use DR for "Delaunay-Rips"
* We will refer to the pipeline that uses DR, Rips, or Alpha for training/validating the corresponding ML model as the "DR method", "Rips method", or "Alpha method", respectively.
* Rename the folders with a numeric prefix (1, 2, 3, ...) to show the order in which they are used

## Import the necessary libraries

In [1]:
%pip install git+https://github.com/amish-mishra/cechmate_DR.git

Collecting git+https://github.com/amish-mishra/cechmate_DR.git
  Cloning https://github.com/amish-mishra/cechmate_DR.git to /tmp/pip-req-build-ld359cy5
  Running command git clone --filter=blob:none --quiet https://github.com/amish-mishra/cechmate_DR.git /tmp/pip-req-build-ld359cy5
  Resolved https://github.com/amish-mishra/cechmate_DR.git to commit 79eab760b46c32c4c978a6229e38bed6d8cd2179
  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import time
import pandas
import pickle
import numpy as np
import cechmate as cm
import matplotlib.pyplot as plt
from ripser import ripser
from sklearn.svm import SVC
from scipy import stats
from sklearn import metrics
from scipy.stats import median_test
from persistence_stats import generate_training_validation_pers_stats
from train_ml_classifiers import train_ml_classifiers
from validate_ml_classifiers import validate_ml_classifiers

## 1. Generate Persistence Statistics from Persistence Diagrams using DR, Rips, and Alpha

In [4]:
# Compute persistence statistics for every (data split, complex) combination.
types = ['Training', 'Validation']
methods = ['rips', 'alpha', 'del_rips']

# Split-major order: all methods for 'Training' first, then for 'Validation'.
split_method_pairs = [(t, m) for t in types for m in methods]
for t, m in split_method_pairs:
    generate_training_validation_pers_stats(type_of_data=t, method=m, verbose=False)

---- Using del_rips ----
Loading CGMH_preprocessed_data/Validation/1.csv
Runtime for processing this file: 35.50304985046387
Loading CGMH_preprocessed_data/Validation/2.csv
Runtime for processing this file: 27.326595544815063
Loading CGMH_preprocessed_data/Validation/3.csv
Runtime for processing this file: 29.08176875114441
Loading CGMH_preprocessed_data/Validation/4.csv
Runtime for processing this file: 29.081491231918335
Loading CGMH_preprocessed_data/Validation/5.csv
Runtime for processing this file: 35.42776417732239
Loading CGMH_preprocessed_data/Validation/6.csv
Runtime for processing this file: 27.986520290374756
Loading CGMH_preprocessed_data/Validation/7.csv
Runtime for processing this file: 34.02066135406494
Loading CGMH_preprocessed_data/Validation/8.csv
Runtime for processing this file: 28.606672525405884
Loading CGMH_preprocessed_data/Validation/9.csv
Runtime for processing this file: 30.588467359542847
Loading CGMH_preprocessed_data/Validation/10.csv
Runtime for processin

## 2. Train ML models (SVM) based on Persistence Statistics

In [3]:
# Train one classifier per complex type, in a fixed order.
func_arr = [
    'rips',
    'alpha',
    'del_rips',
]
for func in func_arr:
    train_ml_classifiers(func)

Training classifer based on rips
SVC(class_weight='balanced', kernel='linear', probability=True)
########## Done training Classifier ###########
Training classifer based on alpha
SVC(class_weight='balanced', kernel='linear', probability=True)
########## Done training Classifier ###########
Training classifer based on del_rips
SVC(class_weight='balanced', kernel='linear', probability=True)
########## Done training Classifier ###########


## 3. Validate ML models

In [5]:
# Validate the classifier of each complex type, in the same order as training.
func_arr = [
    'rips',
    'alpha',
    'del_rips',
]
for func in func_arr:
    validate_ml_classifiers(func)

Validating rips svm...
               1           2           3           4           5           6   \
tp      80.000000   30.000000   12.000000  280.000000  193.000000   13.000000   
fp      57.000000   48.000000    6.000000  153.000000  246.000000   18.000000   
tn     747.000000  646.000000  673.000000  261.000000  352.000000  689.000000   
fn      48.000000    6.000000   35.000000   42.000000  117.000000   18.000000   
se       0.625000    0.833333    0.255319    0.869565    0.622581    0.419355   
sp       0.929104    0.930836    0.991163    0.630435    0.588629    0.974540   
acc      0.887339    0.926027    0.943526    0.735054    0.600220    0.951220   
pr       0.583942    0.384615    0.666667    0.646651    0.439636    0.419355   
f1       0.603774    0.526316    0.369231    0.741722    0.515354    0.419355   
auc      0.902985    0.932957    0.866199    0.799247    0.663518    0.887393   
aps      0.628206    0.529152    0.466837    0.680884    0.523445    0.432992   
kappa

## 4. Generate performance metrics

### Calculate the median and IQR for each method's performance metrics table

In [6]:
func_arr = ['rips', 'alpha', 'del_rips']

# Number of leading rows to skip in each metrics table. These are the
# confusion-matrix elements, whose median and IQR are not relevant.
# NOTE(review): the validation printout lists tp, fp, tn, fn first; confirm the
# pickled tables keep that row order, since the slice below is positional.
N_CONFUSION_ROWS = 4

# method name -> metrics table without the confusion-matrix rows.
# Starts empty (rather than with placeholder values) so that a method whose
# table failed to load is simply missing instead of silently mapping to 0.
all_perf_stats_by_func = {}

for func in func_arr:
    print(f'========== {func} performance ==========')
    # NOTE(review): read_pickle can execute arbitrary code; only load tables
    # that were written locally (presumably by the validation step).
    perf_metrics = pandas.read_pickle(
        f'performance_metrics_tables/perf_metrics_{func}_svm_classifier.pkl')

    # Row-wise summary: each row of perf_metrics is one metric, each column one sample.
    quantile_75 = perf_metrics.quantile(0.75, axis=1)
    quantile_25 = perf_metrics.quantile(0.25, axis=1)
    summary_metrics = pandas.DataFrame({
        'median': perf_metrics.median(axis=1),
        'iqr': quantile_75 - quantile_25,
    })

    relevant_summary_metrics = summary_metrics.iloc[N_CONFUSION_ROWS:]
    all_perf_stats_by_func[func] = perf_metrics.iloc[N_CONFUSION_ROWS:]
    print(relevant_summary_metrics)

         median       iqr
se     0.565217  0.428794
sp     0.925065  0.271714
acc    0.852941  0.161052
pr     0.439636  0.316970
f1     0.517986  0.140499
auc    0.866199  0.083123
aps    0.594109  0.180079
kappa  0.393895  0.178491
         median       iqr
se     0.569892  0.384171
sp     0.917511  0.274028
acc    0.849655  0.155461
pr     0.444206  0.338701
f1     0.487973  0.132413
auc    0.863984  0.080200
aps    0.555532  0.160387
kappa  0.365943  0.193825
         median       iqr
se     0.555556  0.433198
sp     0.917271  0.272863
acc    0.852573  0.161564
pr     0.424307  0.301726
f1     0.472727  0.170183
auc    0.866559  0.096023
aps    0.552859  0.159532
kappa  0.356587  0.192436


### Perform a row-by-row pairwise median test between the DR, Rips, and Alpha methods

In [9]:
# Output column label -> the pair of methods compared in that column.
comparisons = {
    'p-value for rips vs alpha': ('rips', 'alpha'),
    'p-value for rips vs del-rips': ('rips', 'del_rips'),
    'p-value for alpha vs del-rips': ('alpha', 'del_rips'),
}

# One row of p-values per performance metric; the metric labels are taken from
# the rips table and looked up by label in the other two tables.
# NOTE(review): median_test treats its inputs as independent samples. The three
# methods appear to be scored on the same validation files, so a paired test
# may be more appropriate -- confirm.
metric_index = all_perf_stats_by_func['rips'].index
p_value_rows = []
for metric in metric_index:
    samples = {
        method: all_perf_stats_by_func[method].loc[metric].to_numpy()
        for method in ('rips', 'alpha', 'del_rips')
    }
    p_values = {}
    for column, (first, second) in comparisons.items():
        # median_test returns (statistic, p-value, grand median, table).
        _, p_value, _, _ = median_test(samples[first], samples[second])
        p_values[column] = p_value
    p_value_rows.append(p_values)

# Build the table once, already indexed by metric, instead of appending rows
# one at a time and fixing up the index afterwards.
p_val_df = pandas.DataFrame(
    p_value_rows, index=metric_index, columns=list(comparisons))

In [11]:
# Show only the comparisons that involve Delaunay-Rips.
print(p_val_df.loc[:, ['p-value for rips vs del-rips', 'p-value for alpha vs del-rips']])

       p-value for rips vs del-rips  p-value for alpha vs del-rips
se                         1.000000                            1.0
sp                         1.000000                            1.0
acc                        1.000000                            1.0
pr                         1.000000                            1.0
f1                         0.276303                            1.0
auc                        1.000000                            1.0
aps                        0.102470                            1.0
kappa                      1.000000                            1.0
