In [1]:
import numpy as np
import pandas as pd
import os
import logging

In [2]:
# Configure the root logger: DEBUG verbosity, "timestamp:LEVEL:message" records.
logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s", level=logging.DEBUG)

# Handle on the root logger for use in the cells below.
logger = logging.getLogger()

In [3]:
def flat_list(l):
    '''
    Flatten one level of nesting and return the result as a numpy array.

    Elements of `l` that are lists are expanded in place; every other
    element is kept as a single item. Only the first nesting level is
    expanded.
    '''
    flattened = []
    for entry in l:
        if isinstance(entry, list):
            flattened.extend(entry)
        else:
            flattened.append(entry)
    return np.array(flattened)


In [4]:
def best_class(df):
    '''
    Returns a list with the names of all the optimal/best classes.

    For every row of `df`, picks the entry of 'ClassName' whose position
    matches the largest value in 'Prob' (first one on ties, as np.argmax).

    Rows are walked in order rather than looked up by label, so the result
    does not depend on the frame having a default 0..N-1 index.
    '''
    return [
        names[np.argmax(probs)]
        for names, probs in zip(df['ClassName'], df['Prob'])
    ]

In [5]:
def aggregator(df, column_names, norm):
    '''
    Spread the per-row (ClassName, Prob) lists into a dense table and
    summarise it column-wise.

    Parameters
    ----------
    df : DataFrame with columns 'ClassName', 'Prob' and 'model_class'.
    column_names : labels used as the columns of the dense table.
    norm : 'median', 'mean' or 'mutual_information'.

    Returns
    -------
    out : Series with one value per entry of `column_names` (median or mean
        over the rows). For 'mutual_information' the column mean is returned
        as a placeholder only; that mode consumes `raw_data` instead.
    raw_data : the dense table with 'model_class' inserted as first column.

    Raises
    ------
    ValueError if `norm` is not one of the supported values.
    '''
    # Fail early with a clear message instead of printing and handing back None.
    if norm not in ('median', 'mean', 'mutual_information'):
        raise ValueError('NORM should be either "mean", "median" or "mutual_information"')

    temp = pd.DataFrame(np.zeros([df.shape[0], len(column_names)]), columns=column_names)
    # Row i of the dense table receives the probabilities of the i-th row of df.
    for i, (labels, prob) in enumerate(zip(df['ClassName'], df['Prob'])):
        temp.loc[i, labels] = prob

    raw_data = temp.copy()
    raw_data.insert(0, 'model_class', df['model_class'].values)

    if norm == 'median':
        out = temp.median(axis=0)
    else:
        # 'mean', and the dummy value used for 'mutual_information'
        out = temp.mean(axis=0)

    return out, raw_data


In [6]:
def stripper(x, n):
    '''
    Remove up to `n` trailing dot-separated suffixes from the string `x`.

    A dot at position 0 is never treated as a separator, so a leading dot
    (and anything before the first remaining dot) is kept.
    '''
    for _ in range(n):
        cut = x.rfind('.')
        if cut <= 0:
            # nothing left to strip; later passes would be no-ops as well
            break
        x = x[:cut]
    return x

In [7]:
def mk_dir(target):
    '''Create `target` (including missing parents) unless the path already exists.'''
    if os.path.exists(target):
        return
    os.makedirs(target)

In [8]:
def paramGrid(alpha, beta):
    '''
    Return every (alpha, beta) combination as the rows of a 2-column array.

    Rows are ordered with alpha varying slowest: all beta values for the
    first alpha, then all beta values for the second alpha, and so on.
    '''
    alpha_mesh, beta_mesh = np.meshgrid(alpha, beta, indexing='ij')
    pairs = np.stack((alpha_mesh, beta_mesh), axis=-1)
    return pairs.reshape(-1, 2)

In [9]:
def grouper_helper(A, B):
    '''
    Sum the values in `B` that share the same key in `A`.

    Parameters
    ----------
    A : iterable of hashable keys.
    B : iterable of values aligned with `A`, or a single number (Python or
        numpy scalar), which is treated as a one-element list.

    Returns
    -------
    (key_list, value_list) : the distinct keys in order of first appearance
        and the matching summed values.
    '''
    # A lone number is wrapped so it can be zipped with A. numpy scalars are
    # included explicitly because e.g. np.int64 / np.float32 are not
    # subclasses of int / float and would otherwise fail inside zip().
    if isinstance(B, (int, float, np.number)):
        B = [B]

    sums = {}
    for key, value in zip(A, B):
        if key in sums:
            sums[key] += value
        else:
            sums[key] = value

    return list(sums.keys()), list(sums.values())

In [10]:
def grouper(mydf, fold):
    '''
    Fold class names and merge the probabilities of names that collapse.

    For every row, each entry of 'ClassName' is shortened with
    stripper(name, fold) and the row's 'Prob' values are summed per
    shortened name via grouper_helper. The 'ClassName' and 'Prob' columns
    of `mydf` are overwritten with the result and `mydf` is returned.
    '''
    folded = [
        grouper_helper([stripper(label, fold) for label in labels], probs)
        for labels, probs in zip(mydf['ClassName'], mydf['Prob'])
    ]

    mydf['ClassName'] = [names for names, _ in folded]
    mydf['Prob'] = [values for _, values in folded]
    return mydf

In [11]:
def pool(df):
    '''
    Pool all non-neuronal classes of every row under the label 'NonNeuron'.

    For each row, names listed in `non_neuron` are replaced by 'NonNeuron'
    and the probabilities of identical names are summed. The grouped names
    come back in the key order produced by pandas groupby (sorted).

    The 'ClassName' and 'Prob' columns of `df` are overwritten and `df`
    itself is returned.

    Rows are paired by position, so the frame does not need a default
    0..N-1 index.
    '''
    non_neuron = ['Astro.1',
                  'Astro.2',
                  'Astro.3',
                  'Astro.4',
                  'Astro.5',
                  'Choroid',
                  'Endo',
                  'Eryth.1',
                  'Eryth.2',
                  'Microglia.1',
                  'Microglia.2',
                  'Oligo.1',
                  'Oligo.2',
                  'Oligo.3',
                  'Oligo.4',
                  'Oligo.5',
                  'Vsmc'
                  ]

    out_names = []
    out_prob = []
    # zip keeps names and probabilities of the same row together without
    # relying on label-based lookups such as prob[key].
    for names, probs in zip(df['ClassName'], df['Prob']):
        pooled = ['NonNeuron' if x in non_neuron else x for x in names]
        summed = pd.DataFrame({'class_name': pooled, 'prob': probs}).groupby('class_name').sum()

        out_names.append(summed.index.tolist())
        out_prob.append(summed['prob'].tolist())

    df['ClassName'] = out_names
    df['Prob'] = out_prob
    return df


In [12]:
def confusion_matrix(model_data, sim_data, config):
    '''
    Build the table of aggregated simulated probabilities per model class.

    Reads config['fold'] (number of name suffixes to strip) and
    config['norm'] (passed through to aggregator).

    Returns
    -------
    out : DataFrame indexed by every class name found in the (folded)
        simulated data, with one column per unique (folded) model class.
        Each column holds the aggregator output for the rows whose model
        class equals that column label.
    raw_data : the per-class dense tables returned by aggregator,
        concatenated along the rows.
    '''
    fold = config['fold']
    norm = config['norm']

    # the model class is the most likely class according to the model data
    predicted = best_class(model_data)

    cells = sim_data[["Cell_Num", "ClassName", "Prob"]].assign(model_class=predicted)

    # fold the simulated class names and, in the same way, the model class
    cells = grouper(cells, fold)
    cells['model_class'] = [stripper(label, fold) for label in cells['model_class']]

    row_labels = sorted(set(flat_list(cells['ClassName'])))

    # unique model_class names
    col_labels = sorted(set(cells['model_class']))

    out = pd.DataFrame(
        np.zeros((len(row_labels), len(col_labels))),
        columns=col_labels,
        index=row_labels,
    )

    # one pass per class assigned by the model
    per_class_raw = []
    for label in col_labels:
        subset = cells[cells['model_class'] == label]
        agg, raw = aggregator(subset, row_labels, norm)

        per_class_raw.append(raw)
        out.loc[agg.index, label] = agg.values

    # stack the per-class tables along the rows
    return out, pd.concat(per_class_raw, axis=0)


In [13]:
def analytics(df):
    '''
    Return (mean, median) of the diagonal entries df.loc[c, c].

    Only column labels that also appear in the index contribute; columns
    without a matching row are skipped (no zero is added for them).
    '''
    diagonal = [df.loc[label, label] for label in df.columns.values if label in df.index]
    return np.mean(diagonal), np.median(diagonal)

In [14]:
def mkPaths(config):
    '''
    Add the model and simulation data URLs to `config`.

    Reads config['alpha'], config['beta'] and config['mode'] and stores
    the resulting URLs under config['MODEL_DATA'] and config['SIM_DATA'].
    `config` is modified in place and also returned.

    The URLs are assembled with plain '/' joins: they are web addresses,
    not filesystem paths, so os.path.join is not appropriate, and the
    segments are joined with exactly one slash each.
    '''
    alpha = config['alpha']
    beta = config['beta']
    base_url = 'https://raw.githubusercontent.com/acycliq/spacetx/master/dashboard/data/img'

    # the same "alphaX_betaY" tag names both the sub-folder and the file
    tag = 'alpha' + str(alpha) + '_' + 'beta' + str(beta)

    config['MODEL_DATA'] = '/'.join([base_url, 'default_98genes/json/iss.json'])
    config['SIM_DATA'] = '/'.join([base_url, 'grid', config['mode'], tag, tag + '_sims_iss.json'])
    return config

In [15]:
def mutual_information(data):
    '''
    Compute the score this notebook labels "mutual information" from the
    dense table produced by aggregator (`raw_data`).

    Parameters
    ----------
    data : DataFrame whose first column is the model class of each row and
        whose remaining columns are probabilities, one column per class name.

    Returns
    -------
    The sum of log2(p / marginal) over the entries selected below, divided
    by the number of rows. For each row the selected entry is the
    probability in the column named after the row's model class (rows whose
    model class has no column are skipped); `marginal` is the fraction of
    rows selecting that column. Entries equal to zero contribute nothing.
    Returns nan for an empty table.
    '''
    model_class = data.iloc[:, 0]
    prob = data.iloc[:, 1:]
    prob_values = prob.to_numpy(dtype=float)
    sampleSize = prob_values.shape[0]
    if sampleSize == 0:
        return np.nan

    # column position of every predicted name (first occurrence wins),
    # built once instead of rebuilding a list for every row
    column_of = {}
    for j, name in enumerate(prob.columns):
        column_of.setdefault(name, j)

    # mark, in each row, the column that matches the row's model class
    mask = np.zeros(prob_values.shape)
    for i, val in enumerate(model_class):
        j = column_of.get(val)
        if j is not None:
            mask[i, j] = 1

    marginals = mask.sum(axis=0) / mask.shape[0]
    contribution = np.divide(
        prob_values * mask,
        marginals,
        out=np.zeros_like(prob_values),
        where=marginals != 0,
    )

    # `out` must be supplied together with `where`: without it the entries
    # excluded by the mask are left uninitialised and would be summed below.
    logContribution = np.log2(
        contribution,
        out=np.zeros_like(contribution),
        where=contribution != 0,
    )
    mutualInformation = np.sum(logContribution) / sampleSize

    return mutualInformation

In [16]:
def app(config):
    '''
    Run one evaluation for the settings in `config` and return its score.

    Reads the JSON documents at config['MODEL_DATA'] and config['SIM_DATA'],
    optionally pools the non-neuronal classes (config['groupNonNeurons']),
    builds the confusion matrix and returns:

    * the mutual-information score when config['norm'] is
      'mutual_information';
    * otherwise the mean of the confusion-matrix diagonal.
    '''
    model_data = pd.read_json(config['MODEL_DATA'])
    sim_data = pd.read_json(config['SIM_DATA'])

    if config['groupNonNeurons']:
        model_data = pool(model_data)
        sim_data = pool(sim_data)

    cm, raw_data = confusion_matrix(model_data, sim_data, config)

    if config['norm'] == 'mutual_information':
        return mutual_information(raw_data)

    # only the mean of the diagonal is reported; the median is not used
    avg, _median = analytics(cm)
    return avg


In [17]:
# Full alpha/beta grids. NOTE(review): both are overwritten by the two-value
# lists right below, so only the four corner combinations are evaluated --
# presumably a reduced sweep for a quick run; remove the override to run the
# full grid.
alpha = [0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]
beta = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
alpha = [0.25, 3.0]
beta = [0.0, 1.0]
grid = paramGrid(alpha, beta)
config = {}
modes = ['constrained', 'unconstrained']
groupNonNeurons = [True, False]
norms = ['mean']
folds = [0, 1]

maxIter = len(alpha) * len(beta) * len(modes) * len(groupNonNeurons) * len(norms) * len(folds)
out = []
# progress counter; deliberately not called `iter` so the builtin stays usable
iteration = 0
for mode in modes:
    for isNNgrouped in groupNonNeurons:
        for norm in norms:
            for fold in folds:
                for p in grid:
                    iteration += 1

                    config['alpha'] = p[0]
                    config['beta'] = p[1]

                    config['mode'] = mode
                    config['groupNonNeurons'] = isNNgrouped
                    config['norm'] = norm
                    config['fold'] = fold

                    # '\r' with end='' rewrites the same output line on every iteration
                    print( "Iteration: %d out of %d: Doing alpha: %.2f, beta: %.2f "  % (iteration, maxIter, p[0], p[1]) + '\r',end='' )
                    config = mkPaths(config)

                    # start the app
                    res = app(config)

                    # mode, groupNonNeurons, norm, fold, xKey, xLabel, yKey, yLabel, val
                    temp = [mode, isNNgrouped, norm, fold, p[0], str(p[0]), p[1], str(p[1]), res]
                    # append to array
                    out.append(temp)
                
                

Iteration: 32 out of 32: Doing alpha: 3.00, beta: 1.00 

In [18]:
# One row per evaluated configuration; naming the columns at construction
# also works when `out` is empty.
out_df = pd.DataFrame(
    out,
    columns=['mode', 'groupNonNeurons', 'norm', 'fold', 'xKey', 'xLabel', 'yKey', 'yLabel', 'val'],
)
# Raw string: the backslashes of the Windows path are kept literally instead
# of being parsed as (invalid) escape sequences such as "\D" or "\_".
out_df.to_csv(r'D:\Dimitris\Dropbox\_grid\cm_summary2.csv', index=False)
print('Done!')

Done!
