In [1]:
import numpy as np
import scipy.stats as spy
import pandas as pd
import copy


In [2]:
# Base names of the pickle files whose measures are tested; each one is passed
# to load_data.readPickle by the driver loop further down.
pickle_fns = [
    'spheres_angles_end',
    'skel_distances',
    'curvatures_new',
]

In [3]:
def getAll(measure_array,id_name):
    """Pass-through selector: hand back ``measure_array`` untouched.

    ``id_name`` is accepted only so this function can be called the same way
    as the other selector functions (array first, sample id second); it is
    not used.
    """
    selected = measure_array
    return selected

def getSelection(measure_array, id_name, selection):
    """Return the entries of ``measure_array`` chosen for sample ``id_name``.

    ``selection`` is indexed by ``id_name`` and the value found there is used
    directly as the indexer into ``measure_array`` (e.g. a list of positions
    or a boolean mask when ``measure_array`` is a NumPy array).
    """
    return measure_array[selection[id_name]]

def getPerc(measure_array, id_name,perc=2.5):
    """Trim both tails: keep values strictly between two percentiles.

    The lower cut is the ``perc``-th percentile and the upper cut is the
    ``(100 - perc)``-th percentile of ``measure_array``. Values equal to
    either cut are dropped because both comparisons are strict.
    ``id_name`` is unused; it only keeps the selector call shape.
    """
    lower_cut = np.percentile(measure_array, perc)
    upper_cut = np.percentile(measure_array, 100-perc)
    inside = np.logical_and(measure_array > lower_cut, measure_array < upper_cut)
    return measure_array[inside]

def getZ(measure_array, id_name, SD=3):
    """Drop outliers: keep values whose absolute z-score is below ``SD``.

    The z-score uses the mean and (population) standard deviation of
    ``measure_array`` itself. ``id_name`` is unused; it only keeps the
    selector call shape.

    When the standard deviation is exactly zero every value equals the mean,
    so all z-scores are taken to be 0 instead of the NaN that ``0/0`` would
    produce (which would silently discard the whole array).
    """
    spread = np.std(measure_array)
    if spread == 0:
        # Constant input: no value deviates from the mean.
        Z = np.zeros(np.shape(measure_array))
    else:
        Z = np.abs((measure_array - np.mean(measure_array))/spread)
    return measure_array[Z < SD]

def getSample(measure_array, n=10000, rng=None):
    """Draw ``n`` values from ``measure_array`` with replacement.

    Parameters
    ----------
    measure_array : 1-D array-like to resample from.
    n : number of values to draw (default 10000).
    rng : optional random generator exposing ``choice`` (for example
        ``np.random.default_rng(seed)``). Pass a seeded generator to make the
        resampling reproducible. When omitted, the global ``np.random`` state
        is used, exactly as before.

    Returns
    -------
    Array of length ``n`` holding the drawn values.
    """
    chooser = np.random if rng is None else rng
    data = chooser.choice(measure_array, size=n, replace=True)
    return data

def getMannWhit(input_data, ids, measure_name,get_func=getAll, sample_size = None, **kwargs):
    """Collect one measure for every sample and test all pairs of samples.

    For each sample id in ``ids`` the values ``input_data[id][measure_name]``
    are passed through ``get_func(values, id, **kwargs)``. If ``sample_size``
    is truthy the result is then replaced by ``getSample(result, sample_size)``.
    The prepared per-sample data are handed to ``calcMannWhitArray``.

    Returns a dict with:
      'statistic' - matrix of test statistics from ``calcMannWhitArray``
      'p-values'  - matrix of p-values from ``calcMannWhitArray``
      'sizes'     - integer array with the number of values used per sample
    """
    size = len(ids)
    num_vals = np.zeros(size,dtype=int)
    data_array = []
    for i, samp in enumerate(ids):
        data = get_func(input_data[samp][measure_name], samp, **kwargs)
        if sample_size:
            data = getSample(data, sample_size)
        # Store an independent copy so later changes to the source cannot
        # affect what is tested.
        data_array.append(copy.deepcopy(data))
        num_vals[i] = len(data)

    stat_arr, pval_arr = calcMannWhitArray(data_array, size)
    return {'statistic':stat_arr, 'p-values': pval_arr, 'sizes': num_vals}

def calcMannWhitArray(data_arrays, size):
    """Pairwise Mann-Whitney U tests between the first ``size`` samples.

    Parameters
    ----------
    data_arrays : sequence of 1-D samples; entries ``0 .. size-1`` are used.
    size : number of samples to compare.

    Returns
    -------
    (stat_arr, pval_arr) : two ``(size, size)`` float arrays. Cell ``[i, j]``
    with ``i < j`` holds the result of ``mannwhitneyu(data_arrays[i],
    data_arrays[j])``; the same values are mirrored into ``[j, i]`` (the test
    is not recomputed with swapped arguments). The diagonal holds each sample
    tested against itself.
    """
    stat_arr = np.zeros((size,size))
    pval_arr = np.zeros((size,size))
    for i in range(size):
        d1 = data_arrays[i]
        # Diagonal: use the same test as every other cell. This used to call
        # ks_2samp, which put a Kolmogorov-Smirnov D (always 0) into a matrix
        # of Mann-Whitney U statistics.
        stat_arr[i,i], pval_arr[i,i] = spy.mannwhitneyu(d1, d1)
        # Only the upper triangle is computed; results are mirrored below.
        for j in range(i + 1, size):
            stat, pval = spy.mannwhitneyu(d1, data_arrays[j])
            stat_arr[i,j] = stat
            stat_arr[j,i] = stat
            pval_arr[i,j] = pval
            pval_arr[j,i] = pval

    return stat_arr, pval_arr


def getKeys(data):
    """List the measure names that hold 1-D float arrays.

    Only the first entry of ``data`` is inspected and its qualifying keys are
    returned (presumably every entry carries the same measures - verify
    against the pickle contents). A key qualifies when its value is
    one-dimensional, non-empty, and its first element is a ``float``
    instance.

    Returns an empty list when ``data`` has no entries, so callers can always
    iterate over the result.
    """
    for coral in data:
        data_arr = data[coral]
        keys = list()
        for key in data_arr:
            numbers = data_arr[key]
            # len(numbers) > 0 guards numbers[0] against empty arrays.
            if len(numbers.shape) == 1 and len(numbers) > 0 and isinstance(numbers[0], float):
                keys.append(key)
        # Deliberately return after the first entry.
        return keys
    return []

    

In [4]:
# Build the ordered list of sample ids: read the species table, sort its rows
# by morphospecies and keep only the 'Sample ID' column.
# NOTE(review): `vtk` is not imported in any visible cell - confirm it is
# brought into scope elsewhere before this cell runs.
species_table = pd.read_csv(f'{vtk.DIR_DATA}/species_info_v1.csv')
ids = list(species_table.sort_values(by='Morphospecies')['Sample ID'])
# The table itself is not needed afterwards.
del species_table

In [5]:
def prepareSelection(fn_selection, selection_name, ids):
    """Load a selection pickle and extract one named selection per coral.

    ``fn_selection`` is passed to ``load_data.readPickle``. The loaded object
    is indexed first by coral id and then by ``selection_name``.

    Returns a dict mapping each id in ``ids`` to
    ``loaded[id][selection_name]``.
    """
    loaded = load_data.readPickle(fn_selection)
    return {coral_id: loaded[coral_id][selection_name] for coral_id in ids}
        
    

In [6]:
# One entry per selection method. The driver loop that iterates over this
# dict uses the keys as follows:
#   fn_choose   - pickle name handed to prepareSelection
#   selections  - name of the selection read from that pickle
#   data        - pickle name holding the measured values
#   measures    - measure names that get tested
#   sample_size - stored here; the visible driver loop does not read it
selection_combos = dict(
    long=dict(
        fn_choose='skel_distances',
        selections='long_enough',
        data='skel_distances',
        measures=['br_rate1', 'br_rate2'],
        sample_size=70,
    ),
    tips=dict(
        fn_choose='poly_ends',
        selections='point_ids',
        data='curvatures_new',
        measures=['Mean','Gauss', 'Minimum', 'Maximum'],
        sample_size=10000,
    ),
)

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LogNorm

# Example

In [9]:
# Pairwise Mann-Whitney tests for every float measure in every pickle file.
# Each measure is tested in 10 rounds; the round index is part of the output
# file name. p-values go to <DIR_RESULTS>/mwpval, statistics to
# <DIR_RESULTS>/mwstat.
ks_measures = {}
ss = 1000  # sample_size handed to getMannWhit for every sample

for fn in pickle_fns:
    data = load_data.readPickle(fn)
    keys = getKeys(data)
    for i in range(10):
        for key in keys:
            print(key)
            ks_test = getMannWhit(data,ids,key, sample_size=ss)
            csv_name = f"manwhit_{fn}_{key}_size{ss}_{i}.csv"
            # Same file name in both folders; p-values are written first.
            for subdir, result_key in (('mwpval', 'p-values'), ('mwstat', 'statistic')):
                np.savetxt(f"{vtk.DIR_RESULTS}/{subdir}/{csv_name}", ks_test[result_key], delimiter=",")
    # Release the loaded pickle before reading the next one.
    del data, keys


da
db
dc
angle
da
db
dc
angle
da
db
dc
angle
da
db
dc
angle
da
db
dc
angle
da
db
dc
angle
da
db
dc
angle
da
db
dc
angle
da
db
dc
angle
da
db
dc
angle
br_rate1
br_rate2
br_spacing1
br_spacing2
br_rate1
br_rate2
br_spacing1
br_spacing2
br_rate1
br_rate2
br_spacing1
br_spacing2
br_rate1
br_rate2
br_spacing1
br_spacing2
br_rate1
br_rate2
br_spacing1
br_spacing2
br_rate1
br_rate2
br_spacing1
br_spacing2
br_rate1
br_rate2
br_spacing1
br_spacing2
br_rate1
br_rate2
br_spacing1
br_spacing2
br_rate1
br_rate2
br_spacing1
br_spacing2
br_rate1
br_rate2
br_spacing1
br_spacing2
Gauss
Mean
Minimum
Maximum
areas
Gauss
Mean
Minimum
Maximum
areas
Gauss
Mean
Minimum
Maximum
areas
Gauss
Mean
Minimum
Maximum
areas
Gauss
Mean
Minimum
Maximum
areas
Gauss
Mean
Minimum
Maximum
areas
Gauss
Mean
Minimum
Maximum
areas
Gauss
Mean
Minimum
Maximum
areas
Gauss
Mean
Minimum
Maximum
areas
Gauss
Mean
Minimum
Maximum
areas


In [10]:
# Same pairwise tests as the all-values run, but restricted per sample to the
# subset described by each entry of selection_combos (via getSelection).
# NOTE(review): every entry of selection_combos carries its own
# 'sample_size', yet this loop always uses ss - confirm that is intended.
ss = 1000
for select_method, select_array in selection_combos.items():
    print(select_array)
    data_name = select_array['data']
    data = load_data.readPickle(data_name)
    measures = select_array['measures']
    selection = prepareSelection(select_array['fn_choose'], select_array['selections'],ids)
    for i in range(10):
        for key in measures:
            print(key)
            ks_test = getMannWhit(data,ids,key,get_func=getSelection, selection=selection, sample_size = ss)
            csv_name = f"manwhit_{data_name}_{key}_{select_method}_size{ss}_{i}.csv"
            # Same file name in both folders; p-values are written first.
            for subdir, result_key in (('mwpval', 'p-values'), ('mwstat', 'statistic')):
                np.savetxt(f"{vtk.DIR_RESULTS}/{subdir}/{csv_name}", ks_test[result_key], delimiter=",")
    # Release the loaded data before moving to the next selection method.
    del data, selection


{'fn_choose': 'skel_distances', 'selections': 'long_enough', 'data': 'skel_distances', 'measures': ['br_rate1', 'br_rate2'], 'sample_size': 70}
br_rate1
br_rate2
br_rate1
br_rate2
br_rate1
br_rate2
br_rate1
br_rate2
br_rate1
br_rate2
br_rate1
br_rate2
br_rate1
br_rate2
br_rate1
br_rate2
br_rate1
br_rate2
br_rate1
br_rate2
{'fn_choose': 'poly_ends', 'selections': 'point_ids', 'data': 'curvatures_new', 'measures': ['Mean', 'Gauss', 'Minimum', 'Maximum'], 'sample_size': 10000}
Mean
Gauss
Minimum
Maximum
Mean
Gauss
Minimum
Maximum
Mean
Gauss
Minimum
Maximum
Mean
Gauss
Minimum
Maximum
Mean
Gauss
Minimum
Maximum
Mean
Gauss
Minimum
Maximum
Mean
Gauss
Minimum
Maximum
Mean
Gauss
Minimum
Maximum
Mean
Gauss
Minimum
Maximum
Mean
Gauss
Minimum
Maximum


In [11]:
import csv
# Write the sample ids as a single CSV row, in the same order as `ids`.
ids1 = [ids]
# newline='' lets the csv writer control line endings itself; without it an
# extra blank line can appear after each row on platforms that translate '\n'.
with open(f'{vtk.DIR_RESULTS}/manwhit_ids.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    print(ids1)
    writer.writerows(ids1)

[['15Oki01', '18Oki13', '18Oki15', '18Oki16', '18Oki17', '18Oki18', '18Oki31', '18Oki32', '15Oki29', '18Oki33', '18Oki35', '18Oki36', '19Oki12', '19Oki14', '19Oki20', '19Oki22', '19Oki23', '18Oki34', '15Oki24', '18Oki14', '15Oki22', '15Oki23', '15Oki12', '15Oki14', '15Oki08', '15Oki06', '15Oki16', '15Oki03', '19Oki09', '19Oki07', '19Oki17', '19Oki18', '15Oki10', '15Oki04', '18Oki30', '18Oki29', '18Oki28', '18Oki27', '18Oki26', '18Oki25', '15Oki05', '15Oki09', '19Oki25', '18Oki10', '15Oki27', '18Oki07', '15Oki19', '18Oki12', '18Oki11', '18Oki09', '18Oki08', '18Oki02', '18Oki01', '19Oki13', '15Oki40', '19Oki15', '15Oki17', '19Oki16', '15Oki21', '15Oki26', '15Oki25', '15Oki02', '18Oki03', '19Oki05', '18Oki05', '19Oki01', '18Oki04', '15Oki15', '18Oki06', '15Oki11', '15Oki13', '19Oki24', '18Oki23', '18Oki22', '18Oki21', '18Oki20', '18Oki19', '19Oki02', '18Oki24']]
