In [1]:
import numpy as np
import umap
from fbpca import pca
from ipywidgets import interact, interact_manual
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import csv
import pandas as pd

In [2]:
# Session setup: silence warnings and set the working directory.
import warnings
import os
# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')

# NOTE(review): absolute, user-specific paths -- this cell fails on any other
# machine. The chdir into bin/ has no visible effect in this cell, since the
# star-imports below are commented out and the directory is changed again.
os.chdir('/Users/bdemeo/Documents/bergerlab/lsh/ample/bin')
#from datatools import *
#from dataset import *

# Relative paths used later (e.g. open_data's default parent_dir='data/')
# resolve against this directory.
os.chdir('/Users/bdemeo/Documents/bergerlab/lsh/ample')

In [20]:
class dataset(): #an ordered, annotated set of points
    """An ordered, annotated set of points backed by a pandas DataFrame.

    Layout of ``self.data``: the first ``self.numFeatures`` columns are the
    numeric features; every column after that is an annotation (added by
    callers via ``ds.data[name] = values``). All row/column slicing in this
    class is positional (``.iloc``) so it works regardless of index labels.

    Attributes:
        data: DataFrame holding features followed by annotation columns.
        numObs: number of rows at construction time.
        numFeatures: number of leading feature columns.
        path: prefix under which things computed on this data are saved.
        subsamples: dict mapping a name to a child ``dataset``.
        embedding: 2-D embedding plus annotations, or None until computed.
    """

    def __init__(self, data, ft=None, path='', randomize=False):
        """Wrap ``data`` (anything ``pd.DataFrame`` accepts).

        Args:
            data: 2-D array-like or DataFrame; all columns present at
                construction time are counted as features.
            ft: unused; kept for backward compatibility.
            path: where things computed on this data will be saved.
            randomize: if True, shuffle the row order once.
        """
        self.data = pd.DataFrame(data)
        # Read the shape from the DataFrame so list input works as well.
        self.numObs, self.numFeatures = self.data.shape

        if randomize: #random datapoint ordering
            randomOrder = np.random.choice(self.numObs, self.numObs, replace=False)
            # Positional row selection; label-style [rows, :] is invalid on a DataFrame.
            self.data = self.data.iloc[randomOrder, :]

        self.path = path #where things computed on this data will be saved
        self.subsamples = {} #sub-datasets, each with its own embedding
        self.embedding = None #computed lazily as needed

    #UMAP to embed some of the points, carrying over their annotations
    def make_embedding(self, max_size=20000):
        """Embed the first ``min(max_size, numObs)`` rows into 2-D.

        Data that already has exactly two features is used as-is; otherwise
        UMAP is fitted on the feature columns. The result is stored in
        ``self.embedding`` as coordinate columns 0 and 1 followed by the
        annotation columns of the same rows.
        """
        size = min(max_size, self.numObs)
        print('embedding size {}'.format(size))

        features = self.data.iloc[:size, :self.numFeatures]

        if self.numFeatures == 2:
            # Only the feature slice: using the whole frame would drag the
            # annotation columns in and duplicate them in the concat below.
            coords = features.values
        else:
            coords = umap.UMAP().fit_transform(features.values)

        embedding = pd.DataFrame(coords)
        embedding.index = range(embedding.shape[0])

        # Annotations of the embedded rows, re-indexed 0..size-1 so that the
        # concat aligns row-for-row with the coordinates.
        annos = self.data.iloc[:size, self.numFeatures:].reset_index(drop=True)

        self.embedding = pd.concat([embedding, annos], axis=1, join='inner')

        print(self.embedding.values.shape)

    def load_subsample(self, path, name, delimiter = '\t'): #store subsample saved on disk
        """Register a subsample whose row order is stored on disk.

        The first line of ``path`` is read as a ``delimiter``-separated list
        of integer row positions into ``self.data``.

        Raises:
            ValueError: if the file contains no rows.
        """
        with open(path) as f:
            reader = csv.reader(f, delimiter = delimiter)
            first_row = next(reader, None)

        if first_row is None:
            raise ValueError('subsample file {} is empty'.format(path))

        # Ignore empty fields (e.g. from a trailing delimiter).
        order = np.array([field for field in first_row if field.strip()]).astype('int')

        subsample = dataset(self.data.iloc[order, :], randomize=False, path=self.path+'/subsamples/')
        # The slice carries annotation columns too; keep the parent's split
        # between features and annotations (same as subset()).
        subsample.numFeatures = self.numFeatures

        self.subsamples[name] = subsample

    def make_subsample(self, downsampler, name, max_size=20000, **kwargs):
        """Build a subsample with the sampler class named ``downsampler``.

        Args:
            downsampler: attribute name looked up on ``hashers``.
            name: key under which the subsample is stored.
            max_size: upper bound on the requested sample size.
            **kwargs: forwarded to the sampler constructor.
        """
        #construct subsampler
        # NOTE(review): `hashers` is not imported anywhere in the visible
        # notebook; it must be in scope for this method to run.
        sampler_func = getattr(hashers, downsampler)
        # presumably samplers expect a plain ndarray of features -- TODO confirm
        sampler = sampler_func(data=self.data.iloc[:, :self.numFeatures].values, **kwargs)

        #make the subsample
        size = min(max_size, self.numObs)
        sampler.downsample(size)
        # Results live on the sampler instance, not on the name string.
        sample = sampler.sample

        #bundle it into a dataset
        subsample = dataset(self.data.iloc[sample, :], randomize=False, path=self.path+'/subsamples/')
        subsample.numFeatures = self.numFeatures

        if hasattr(sampler, 'ptrs'):
            # Stored as an annotation column; assumes one entry per sampled
            # row -- TODO confirm against the sampler implementation.
            subsample.data['ptr'] = sampler.ptrs

        self.subsamples[name] = subsample

    def sort_values(self, by, **kwargs):
        """Sort ``self.data`` (and the embedding, if present) by column ``by``.

        Extra keyword arguments are forwarded to ``DataFrame.sort_values``.
        """
        # The result is re-assigned below, so inplace=True would store None.
        kwargs.pop('inplace', None)

        self.data = self.data.sort_values(by, **kwargs)

        if self.embedding is not None:
            print('sorting embedding')
            self.embedding = self.embedding.sort_values(by, **kwargs)

    def subset(self, annoName, value):
        """Return a new dataset with the rows where ``annoName == value``."""
        result = dataset(self.data.loc[self.data[annoName] == value])
        # Annotation columns must not be counted as features in the child.
        result.numFeatures = self.numFeatures
        return(result)

    def grow(self, max_size=20000, cmap='Set1'):
        """Interactive scatter of the first N embedded points.

        Builds the embedding on first use. The widget exposes a slider for N
        and a dropdown of the embedding's annotation columns to colour by.
        """
        if self.embedding is None:
            print('making embedding')
            self.make_embedding(max_size)

        max_n = min(self.embedding.shape[0], max_size)

        #annotations (besides coordinates) are possible colorings
        color = list(self.embedding.columns)[2:]

        @interact
        def build_plot(N=(1, max_n, 1), color=color):
            shown = self.embedding.iloc[:N, :]
            x = shown.iloc[:, 0].values
            y = shown.iloc[:, 1].values

            if color is None or len(color) == 0:
                # No annotation to colour by: draw everything in one colour.
                plt.scatter(x, y, c=[1] * len(x), cmap=cmap)
                return

            # Encode labels from the embedding's own column so colours always
            # match the plotted rows, and fit on the full column so a class
            # keeps its colour as N changes.
            le = LabelEncoder().fit(self.embedding.loc[:, color])
            codes = le.transform(shown.loc[:, color])

            palette = plt.get_cmap(cmap)
            top = max(len(le.classes_) - 1, 1)
            for code, label in enumerate(le.classes_):
                member = codes == code
                if member.any():
                    # One labelled artist per class so the legend has entries.
                    plt.scatter(x[member], y[member],
                                color=palette(code / float(top)),
                                label=str(label))
            plt.legend()

    def grow_all(self, samples=None):
        """Interactive chooser that calls ``grow()`` on a selected subsample."""
        if samples is None:
            samples = self.subsamples.keys()
        # interact builds a dropdown from a list; dict views are not lists.
        samples = list(samples)

        @interact
        def execute(sample=samples):
            self.subsamples[sample].grow()

    def embed_all(self, max_size=20000):
        """Compute the embedding of every registered subsample."""
        for s in self.subsamples.values():
            s.make_embedding(max_size=max_size)

    def hasEmbedding(self):
        """Return True once an embedding has been computed."""
        return(self.embedding is not None)

    def pca_dimred(self, n_components=100, filename = None):
        """Replace the feature columns by their leading principal components.

        Annotation columns are kept and ``numFeatures`` is updated. When
        ``filename`` is given, the reduced matrix is also written to
        ``self.path + filename + '.txt'`` (tab-delimited).
        """
        features = self.data.iloc[:, :self.numFeatures].values.astype(float)
        # presumably fbpca rejects k larger than the matrix dimensions -- TODO confirm
        k = min(n_components, features.shape[0], features.shape[1])

        U, s, Vt = pca(features, k=k)
        dimred = U[:, :k] * s[:k]

        # Keep self.data a DataFrame (features first, annotations after) so
        # every other method keeps working after the reduction.
        annos = self.data.iloc[:, self.numFeatures:]
        reduced = pd.DataFrame(dimred, index=self.data.index)
        self.data = pd.concat([reduced, annos], axis=1)
        self.numFeatures = dimred.shape[1]

        if(filename is not None):
            np.savetxt(self.path+filename+'.txt', dimred, delimiter='\t')


In [21]:
import csv
import os
from sklearn import preprocessing

def open_data(NAMESPACE, parent_dir='data/', delimiter = '\t', dimred=True, annos = []):
    """Load a dataset directory into a ``dataset`` object.

    Reads ``<parent_dir><NAMESPACE>/dimred.txt`` (or ``full.txt`` when
    ``dimred`` is False) as a delimiter-separated numeric matrix; the file is
    assumed clean, with no column names. For each name in ``annos`` the file
    ``<name>.txt`` in the same directory is read and the first field of every
    row is attached as a column of that name; a missing file only prints a
    warning. If ``ft.txt`` exists there, it is loaded as the subsample 'ft'.

    Returns:
        The populated ``dataset``.
    """
    base_dir = parent_dir + NAMESPACE + '/'
    matrix_name = 'dimred' if dimred else 'full'

    with open(base_dir + matrix_name + '.txt') as handle:
        rows = list(csv.reader(handle, delimiter = delimiter))
    data = np.array(rows).astype(float) #assumes clean data, no column names

    result = dataset(data, path=base_dir)

    for anno in annos: #search for the indicated annotation file
        anno_path = base_dir + anno + '.txt'
        if not os.path.exists(anno_path):
            print('WARNING: could not find annotation {}'.format(anno))
            continue

        with open(anno_path) as handle:
            entries = np.array(list(csv.reader(handle, delimiter = delimiter)))
        # Only the first field of each row is used as the label.
        result.data[anno] = [entry[0] for entry in entries]

    #search for subsamples
    subsample_path = base_dir + 'ft.txt'
    if os.path.exists(subsample_path):
        result.load_subsample(subsample_path, name='ft', delimiter=delimiter)

    return(result)


# def open_data(NAMESPACE, parent_dir='data/', delimiter = '\t', dimred=True, annos = []):
#
#     filename = 'dimred' if dimred else 'full'
#     with open(parent_dir+NAMESPACE+'/'+filename+'.txt') as f:
#         reader = csv.reader(f, delimiter = delimiter)
#         data = np.array(list(reader)).astype(float) #assumes clean data, no column names
#
#     result = dataset(data, path=parent_dir+NAMESPACE+'/')
#
#
#     for anno in annos: #search for the indicated annotation file
#         if os.path.exists(parent_dir+NAMESPACE+'/'+anno+'.txt'):
#             with open(parent_dir+NAMESPACE+'/'+anno+'.txt') as f:
#                 reader = csv.reader(f, delimiter = delimiter)
#                 labels = np.array(list(reader))
#                 labels = [x[0] for x in labels]
#                 result.data[anno] = labels
#         else:
#             print('WARNING: could not find annotation {}'.format(anno))
#
#
#     #search for subsamples
#     if os.path.exists(parent_dir+NAMESPACE+'/ft.txt'):
#         result.load_subsample(parent_dir+NAMESPACE+'/ft.txt', name='ft', delimiter=delimiter)
#
#
#     return(result)


In [34]:
# Load the 'zheng' dataset (dimred matrix by default) and attach the
# 'patients' and 'labels' annotation files as columns.
zheng = open_data('zheng', annos=['patients','labels'])

In [35]:
# Keep only the rows whose 'patients' annotation equals 'P1116'.
zheng_sub = zheng.subset('patients','P1116')

In [36]:
# Reorder the subset by 'labels'; this replaces zheng_sub.data with the sorted frame.
zheng_sub.sort_values('labels')

In [37]:
# Open the interactive scatter; no embedding exists yet, so grow() computes one first.
zheng_sub.grow()

making embedding
embedding size 818
(818, 4)


aW50ZXJhY3RpdmUoY2hpbGRyZW49KEludFNsaWRlcih2YWx1ZT00MDksIGRlc2NyaXB0aW9uPXUnTicsIG1heD04MTgsIG1pbj0xKSwgRHJvcGRvd24oZGVzY3JpcHRpb249dSdjb2xvcicsIG/igKY=


In [38]:
# Inspect the embedding table: coordinate columns 0 and 1, then the annotation columns.
zheng_sub.embedding

Unnamed: 0,0,1,patients,labels
0,-7.220150,0.261195,P1116,PTC
1,-7.053816,0.824679,P1116,PTC
2,-7.211232,0.435937,P1116,PTC
3,-7.248108,0.237032,P1116,PTC
4,-6.624517,0.084219,P1116,PTC
5,-7.123943,0.390335,P1116,PTC
6,-7.013472,0.053592,P1116,PTC
7,-6.798837,-0.186369,P1116,PTC
8,-6.416488,0.531805,P1116,PTC
9,2.160549,-3.912886,P1116,PTC
