In [4]:
import glob
import numpy as np
from scipy import stats

In [5]:
def Score(X,num_obj_cat):
    """
    Scatter-based separation score between consecutive groups of rows of X.

    Every feature is standardised over all rows (mean removed, divided by the
    sample standard deviation, ddof=1). The within-class scatter matrix Sw and
    the between-class scatter matrix Sb are then accumulated on the
    standardised data and trace(inv(Sw) . Sb) is returned.

    IN:
    X: [N,M] array, where M is the number of features and N the number of objects.
       Rows must be grouped by class: the first num_obj_cat[0] rows are class 0,
       the next num_obj_cat[1] rows are class 1, and so on.
    num_obj_cat: array-like (array, list or tuple) containing number of objects
       for each class. Its entries must add up to N.

    OUT:
    Tc: Scatter distance

    RAISES:
    ValueError: if X is not 2-D or the class sizes do not add up to N.
    numpy.linalg.LinAlgError: if Sw is exactly singular.

    A feature with zero standard deviation makes the standardisation divide by
    zero, so the result is not finite in that case.
    """

    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D [N,M] array, got %d dimension(s)" % X.ndim)
    numSamples,Dim = X.shape

    # Accept any array-like for the class sizes, not only numpy arrays.
    counts = np.asarray(num_obj_cat).ravel()
    if counts.sum() != numSamples:
        raise ValueError(
            "num_obj_cat sums to {} but X has {} rows".format(counts.sum(),numSamples))

    # Standardise each feature over the whole data set.
    u = np.mean(X,axis=0)
    s = np.std(X,axis=0,ddof=1)
    Z = (X - u)/s

    # ind[k]:ind[k+1] is the row range of class k.
    ind = np.concatenate(([0],np.cumsum(counts)))

    Sw = np.zeros([Dim,Dim]) # Within-cluster scatter matrix
    Sb = np.zeros([Dim,Dim]) # Between-cluster scatter matrix
    for k in range(counts.size):
        data_class = Z[ind[k]:ind[k+1]]
        if data_class.shape[0] == 0:
            # An empty class contributes to neither matrix; its mean would be NaN.
            continue

        uCat = np.mean(data_class,axis=0)
        centered = data_class - uCat
        Sw += np.dot(centered.T,centered)

        # The global mean of Z is zero after standardisation, so the class
        # mean itself is the deviation from the global mean.
        Sb += counts[k]*np.outer(uCat,uCat)

    # Solve Sw . C = Sb instead of forming inv(Sw) explicitly.
    C = np.linalg.solve(Sw,Sb)
    Tc = np.trace(C)

    return Tc

In [6]:
def read_original_breakpoints(samples_breakpoints,n):
    """
    Read 4-line records from a text file and keep those with exactly n slopes.

    Each record is made of four consecutive lines:
        line 1: integer index (1-based in the file, returned 0-based)
        line 2: whitespace-separated floats, returned as slopes
        line 3: whitespace-separated floats, returned as breakpoints
        line 4: whitespace-separated floats, returned as preds

    IN:
    samples_breakpoints: path of the file to read
    n: number of slopes a record must have in order to be returned

    OUT:
    idxs: array with the file index minus one of every kept record
    slopes, breakpoints, preds: arrays with one entry per kept record

    RAISES:
    ValueError: if the number of lines is not a multiple of 4, or a value
                cannot be parsed as a number.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(samples_breakpoints,'r') as handle:
        lines = [line.rstrip('\n') for line in handle]

    # Works with or without a final newline; trailing blank lines are ignored.
    while lines and not lines[-1].strip():
        lines.pop()

    if len(lines) % 4 != 0:
        raise ValueError(
            "expected 4 lines per record, found {} lines in {}".format(
                len(lines),samples_breakpoints))

    slopes = []
    breakpoints = []
    preds = []
    idxs = []
    for i in range(0,len(lines),4):
        idx = int(lines[i]) - 1

        # split() without argument tolerates repeated or trailing blanks.
        slopes_i = [float(value) for value in lines[i+1].split()]
        breakpoints_i = [float(value) for value in lines[i+2].split()]
        preds_i = [float(value) for value in lines[i+3].split()]
        if len(slopes_i) == n:
            idxs.append(idx)
            slopes.append(np.asarray(slopes_i))
            breakpoints.append(np.asarray(breakpoints_i))
            preds.append(np.asarray(preds_i))

    return np.asarray(idxs),np.asarray(slopes),np.asarray(breakpoints),np.asarray(preds)

def read_file(samples_breakpoints):
    """
    Read every 3-line record from a text file.

    Each record is made of three consecutive lines:
        line 1: integer index (1-based in the file, returned 0-based)
        line 2: whitespace-separated floats, returned as slopes
        line 3: whitespace-separated floats, returned unchanged as breakpoints

    IN:
    samples_breakpoints: path of the file to read

    OUT:
    idxs: array with the file index minus one of every record
    slopes, breakpoints: arrays with one entry per record

    RAISES:
    ValueError: if the number of lines is not a multiple of 3, or a value
                cannot be parsed as a number.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(samples_breakpoints,'r') as handle:
        lines = [line.rstrip('\n') for line in handle]

    # Works with or without a final newline; trailing blank lines are ignored.
    while lines and not lines[-1].strip():
        lines.pop()

    if len(lines) % 3 != 0:
        raise ValueError(
            "expected 3 lines per record, found {} lines in {}".format(
                len(lines),samples_breakpoints))

    slopes = []
    breakpoints = []
    idxs = []
    for i in range(0,len(lines),3):
        idx = int(lines[i]) - 1

        # split() without argument tolerates repeated or trailing blanks.
        slopes_i = [float(value) for value in lines[i+1].split()]
        breakpoints_i = [float(value) for value in lines[i+2].split()]

        idxs.append(idx)
        slopes.append(np.asarray(slopes_i))
        breakpoints.append(np.asarray(breakpoints_i))

    return np.asarray(idxs),np.asarray(slopes),np.asarray(breakpoints)

def breakpoints2intervals(x):
    """
    Turn a sequence of breakpoints into the list of interval lengths.

    The result holds x[0], then every difference between consecutive
    breakpoints, and finally 1 - x[-1], i.e. len(x)+1 values.
    """
    # Length of the first interval, measured from 0.
    lengths = [x[0]]
    # Gaps between neighbouring breakpoints.
    lengths.extend(right - left for left,right in zip(x[:-1],x[1:]))
    # Remaining length up to 1.
    lengths.append(1-x[-1])
    return lengths

def norm_data(slopes_original,intervals_original,slopes_artificial,intervals_artificial):
    """
    Build the feature matrices of both groups and standardise them jointly.

    Slopes and intervals are joined column-wise for each group. The mean and
    the (population, ddof=0) standard deviation of every column are computed on
    the rows of both groups together and applied to each output.

    Returns the standardised original rows, the standardised artificial rows,
    and the standardised stack of both (original rows first).
    """
    real = np.concatenate((slopes_original,intervals_original),axis=1)
    synthetic = np.concatenate((slopes_artificial,intervals_artificial),axis=1)
    pooled = np.concatenate((real,synthetic),axis=0)

    # Column statistics taken over the pooled rows of both groups.
    center = np.mean(pooled,axis=0)
    scale = np.std(pooled,axis=0)

    def standardise(block):
        return (block - center)/scale

    return standardise(real),standardise(synthetic),standardise(pooled)

In [7]:
# Artificial-series files matching the pattern, kept in lexicographic order so
# that the run order does not depend on the file system.
filenames = glob.glob('data/plos_one_artificial_*.txt')
filenames.sort()

# File with the segmentation of the original series that every artificial file
# is compared against.
original_data_filename = 'data/plos_one_total_breakpoints_k4it.max100stop.if.errorFALSE_original_data_filtered.txt'

In [8]:
# Display the sorted list of artificial-data files that will be scored.
filenames

['data/plos_one_artificial_all_random_2.txt',
 'data/plos_one_artificial_all_random_3.txt',
 'data/plos_one_artificial_all_random_4.txt',
 'data/plos_one_artificial_all_random_5.txt',
 'data/plos_one_artificial_intervals_slope_axis0_2.txt',
 'data/plos_one_artificial_intervals_slope_axis0_3.txt',
 'data/plos_one_artificial_intervals_slope_axis0_4.txt',
 'data/plos_one_artificial_intervals_slope_axis0_5.txt',
 'data/plos_one_artificial_intervals_slope_random_2.txt',
 'data/plos_one_artificial_intervals_slope_random_3.txt',
 'data/plos_one_artificial_intervals_slope_random_4.txt',
 'data/plos_one_artificial_intervals_slope_random_5.txt',
 'data/plos_one_artificial_intervals_slopes_2.txt',
 'data/plos_one_artificial_intervals_slopes_3.txt',
 'data/plos_one_artificial_intervals_slopes_4.txt',
 'data/plos_one_artificial_intervals_slopes_5.txt',
 'data/plos_one_artificial_slopes_interval_axis0_2.txt',
 'data/plos_one_artificial_slopes_interval_axis0_3.txt',
 'data/plos_one_artificial_slopes_

In [13]:
# For every artificial file, print its name followed by the scatter distance
# between the original series and the artificial series.
for f in filenames:
    print(f)

    # The integer between the last "_" and ".txt" in the file name selects
    # which original series are used (those with exactly n slopes).
    n = int(f.rsplit('_',1)[-1].split('.txt')[0])

    # Indices and preds of the original series are not needed here.
    _,slopes,breakpoints,_ = read_original_breakpoints(original_data_filename,n)

    # 57.2958 is approximately 180/pi: arctan of each slope expressed in degrees.
    slopes_original = np.asarray([57.2958*np.arctan(row) for row in slopes])
    intervals_original = np.asarray([np.asarray(breakpoints2intervals(row)) for row in breakpoints])

    # The artificial file is used as read: its third line is taken as intervals.
    _,slopes_artificial,intervals_artificial = read_file(f)

    # NOTE(review): every row built by breakpoints2intervals sums to 1, so the
    # interval columns are linearly dependent. If the artificial intervals also
    # sum to 1, the Sw matrix inverted inside Score is (near-)singular --
    # TODO confirm the printed scores are numerically stable.
    original,artificial,all_data = norm_data(slopes_original,intervals_original,slopes_artificial,intervals_artificial)
    print(Score(all_data,np.array([len(original),len(artificial)])))

data/plos_one_artificial_all_random_2.txt
0.12632442088739662
data/plos_one_artificial_all_random_3.txt
0.787159695435364
data/plos_one_artificial_all_random_4.txt
2.6270670867630628
data/plos_one_artificial_all_random_5.txt
4.9476059642318715
data/plos_one_artificial_intervals_slope_axis0_2.txt
0.0001137556129124228
data/plos_one_artificial_intervals_slope_axis0_3.txt
0.00010705796349294838
data/plos_one_artificial_intervals_slope_axis0_4.txt
0.0001335999793633252
data/plos_one_artificial_intervals_slope_axis0_5.txt
3.0397153659597633e-05
data/plos_one_artificial_intervals_slope_random_2.txt
0.08709669651555463
data/plos_one_artificial_intervals_slope_random_3.txt
0.42244704467550637
data/plos_one_artificial_intervals_slope_random_4.txt
0.3621055456594155
data/plos_one_artificial_intervals_slope_random_5.txt
0.283289090080468
data/plos_one_artificial_intervals_slopes_2.txt
6.854462299098515e-05
data/plos_one_artificial_intervals_slopes_3.txt
8.749954547236636e-05
data/plos_one_artific