### Import libraries

In [None]:
import numpy as np
import pandas as pd
import feather
import itertools

In [None]:
from utilityFuncs import get_problematic_features
from utilityFuncs import remove_problematic_datasets
from utilityFuncs import zScore

--------------------------------------

### Load calculated features

In [3]:
# Read the previously calculated feature matrix from its feather file into a
# DataFrame; every later cell derives its data from this frame.
fullFeatMat = feather.read_dataframe('data/EmpFeatMat.feather')

--------------------------------------

### Filter and normalise the features

Remove bad features and bad IDs.

In [None]:
# Run the project helper on the full feature matrix. It returns two values:
# the filtered frame (filt1) and a second object kept as goodIDs1
# (presumably the IDs that survived the filtering -- confirm in utilityFuncs).
filt1, goodIDs1 = remove_problematic_datasets(fullFeatMat)

Normalise the values of each feature separately within each method (using `zScore`).

In [None]:
# Normalise every feature (grouped by 'names') separately within each method.
# The per-method results are collected first and concatenated once at the end:
# growing a DataFrame with pd.concat inside the loop re-copies all rows
# gathered so far on every iteration and has to start from an empty frame.
normedFrames = [
    filt1[filt1['method'] == method].groupby('names').apply(zScore)
    for method in filt1['method'].unique()
]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# method is present at all (the result the incremental version produced).
# ignore_index=True discards the group index and yields a fresh 0..n-1 index.
if normedFrames:
    filt2 = pd.concat(normedFrames, ignore_index=True)
else:
    filt2 = pd.DataFrame()

# display the normalised frame in the notebook
filt2

In [None]:
# Persist the filtered and normalised feature matrix so that it can be reused
# for the PCA. At this position in the notebook the comb_id column has not
# been added yet, so the stored file does not contain it.
filt2.to_feather('data/fullFeatMatFilt.feather')

--------------------------------------

Now add a column called _comb_id_, which stores the method name combined with the feature name (joined by an underscore).

In [None]:
# Build the combined identifier "<method>_<feature name>" for every row.
# Column-wise concatenation replaces the former row-by-row apply(axis=1):
# it avoids one Python-level function call per row and also works when the
# frame has no rows. The casts to object make "+" operate element-wise on the
# stored string values even if a column carries a categorical dtype.
filt2['comb_id'] = filt2['method'].astype(object) + '_' + filt2['names'].astype(object)

# Drop the reference to the pre-normalisation frame.
del filt1

--------------------------------------

### Compute correlation

Next step: take every possible combination of two methods and calculate the Spearman correlation between all features of the first method and all features of the second.

In [None]:
# List every method present in the data and build all unordered pairs of two
# distinct methods.
methods = filt2['method'].unique()
methodCombinations = list(itertools.combinations(methods, r=2))

# For each pair of methods, compute the Spearman correlation between every
# feature of the first method and every feature of the second, and write the
# result in long format to its own feather file (corrMats/corrMat<i>.feather,
# where i is the position of the pair in methodCombinations).
for i, combination in enumerate(methodCombinations):

    print('Doing: ', combination)

    method1, method2 = combination

    # One wide table per method: one row per 'id', one column per comb_id.
    # No `values=` is passed, so pivot_table decides which column(s) to
    # aggregate; the renaming further down only works if that leaves exactly
    # one value column.
    frame1 = pd.pivot_table(filt2[filt2['method'] == method1], index=['id'], columns=['comb_id'])
    frame2 = pd.pivot_table(filt2[filt2['method'] == method2], index=['id'], columns=['comb_id'])

    # Correlate all columns of both tables with each other, then keep only the
    # cross-method part: rows = features of method 1, columns = features of
    # method 2.
    allCorr = pd.concat([frame1, frame2], axis=1).corr(method='spearman').filter(frame2.columns).filter(frame1.columns, axis=0)

    # Move the method-2 feature names from the columns into the row index, so
    # the correlations end up in a single column.
    tmpDF = allCorr.stack()

    # Name the index levels and the value column so that reset_index() yields
    # readable columns; the first (unnamed) index level is dropped afterwards.
    tmpDF.index.names = [None, 'feat1', 'feat2']
    tmpDF.columns = ['corr']
    corrDF = tmpDF.reset_index().iloc[:, 1:]

    # Prepend the two method names; a scalar is broadcast to every row.
    # Inserting method2 first leaves the order method1, method2, feat1, ...
    corrDF.insert(loc=0, column='method2', value=method2)
    corrDF.insert(loc=0, column='method1', value=method1)

    # Alternative output location kept from an earlier version:
    # 'corrMatsNorm/corrMat' + str(i) + '.feather'
    corrDF.to_feather('corrMats/corrMat' + str(i) + '.feather')

print('\nDone.')